Enhance API error handling & add coverage #206

Open · wants to merge 8 commits into base: main
7 changes: 7 additions & 0 deletions .github/workflows/CI.yml
@@ -84,6 +84,13 @@ jobs:
         run: |
           UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock
 
+      - name: 📥 Download NLTK Data
+        run: |
+          # Download required NLTK resources
+          rye run python libs/megaparse/scripts/download_nltk_data.py
+
       - name: 🚀 Run tests
         run: |
+          # Run both API (megaparse) and SDK tests
           rye test -p megaparse
+          rye test -p megaparse-sdk
17 changes: 16 additions & 1 deletion libs/megaparse/pyproject.toml
Expand Up @@ -34,6 +34,7 @@ dependencies = [
"onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'",
"onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'",
"pypdfium2>=4.30.0",
"pytest-xdist>=3.6.1",
]

[project.optional-dependencies]
@@ -54,9 +55,23 @@ build-backend = "hatchling.build"
 
 [tool.rye]
 managed = true
-dev-dependencies = []
+dev-dependencies = [
+    "pytest>=7.4.0",
+    "pytest-asyncio>=0.23.0",
+    "pytest-timeout>=2.1.0",
+    "pytest-xdist>=3.6.1",
+    "httpx>=0.24.1"
+]
 universal = true
 
+[tool.pytest.ini_options]
+asyncio_mode = "strict"
+asyncio_default_fixture_loop_scope = "function"
+timeout = 30
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = "-v --tb=short"
+
 [tool.hatch.metadata]
 allow-direct-references = true

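A note on the new `[tool.pytest.ini_options]` block: with `asyncio_mode = "strict"`, pytest-asyncio only runs coroutine tests that carry an explicit marker. A minimal sketch of a test module this configuration would collect (the file name and test are illustrative, not part of the diff):

```python
# tests/test_smoke.py (hypothetical); discovered via testpaths/python_files above.
import asyncio

import pytest


@pytest.mark.asyncio  # required under asyncio_mode = "strict"
async def test_smoke():
    await asyncio.sleep(0)  # any awaitable work; must finish within timeout = 30
    assert True
```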
23 changes: 23 additions & 0 deletions libs/megaparse/scripts/download_nltk_data.py
@@ -0,0 +1,23 @@
"""Script to download required NLTK resources."""

import nltk


def download_nltk_resources():
"""Download required NLTK resources."""
resources = [
"punkt",
"punkt_tab",
"averaged_perceptron_tagger",
"averaged_perceptron_tagger_eng",
"maxent_ne_chunker",
"words",
"stopwords",
]
for resource in resources:
print(f"Downloading {resource}...")
nltk.download(resource)


if __name__ == "__main__":
download_nltk_resources()
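The script downloads each resource unconditionally on every run. For illustration, a companion check (not part of the PR) could confirm the resources actually landed on the NLTK data path; `nltk.data.find` raises `LookupError` for anything missing:

```python
# Hypothetical verification helper, assuming the same resource set as above.
import nltk

checks = [
    "tokenizers/punkt",
    "taggers/averaged_perceptron_tagger",
    "chunkers/maxent_ne_chunker",
    "corpora/words",
    "corpora/stopwords",
]
for path in checks:
    try:
        nltk.data.find(path)
        print(f"{path}: present")
    except LookupError:
        print(f"{path}: missing")
```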
120 changes: 87 additions & 33 deletions libs/megaparse/src/megaparse/api/app.py
@@ -1,7 +1,9 @@
+import asyncio
 import io
 import os
 import tempfile
 from typing import Optional
+from urllib.parse import urlparse
 
 import httpx
 import psutil
@@ -22,7 +24,7 @@
 from megaparse.api.exceptions.megaparse_exceptions import (
     HTTPDownloadError,
     HTTPFileNotFound,
-    HTTPModelNotSupported,
+    HTTPMemoryError,
     HTTPParsingException,
     ParsingException,
 )
@@ -67,28 +69,32 @@ async def parse_file(
     check_table: bool = Form(False),
     language: Language = Form(Language.ENGLISH),
     parsing_instruction: Optional[str] = Form(None),
-    model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O),
+    model_name: Optional[str] = Form(SupportedModel.GPT_4O.value),
     parser_builder=Depends(parser_builder_dep),
 ) -> dict[str, str]:
     if not _check_free_memory():
-        raise HTTPException(
-            status_code=503, detail="Service unavailable due to low memory"
-        )
+        raise HTTPMemoryError()
     model = None
     if model_name and check_table:
-        if model_name.startswith("gpt"):
-            model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY"))  # type: ignore
-        elif model_name.startswith("claude"):
+        supported_models = [model.value for model in SupportedModel]
+        if model_name not in supported_models:
+            raise HTTPException(
+                status_code=501,
+                detail=f"Model {model_name} is not supported. Please use one of {supported_models}",
+            )
+        model_name_str = model_name
+        if model_name_str.startswith("gpt"):
+            model = ChatOpenAI(
+                model=model_name_str, api_key=os.getenv("OPENAI_API_KEY")
+            )  # type: ignore
+        elif model_name_str.startswith("claude"):
             model = ChatAnthropic(
-                model_name=model_name,
+                model_name=model_name_str,
                 api_key=os.getenv("ANTHROPIC_API_KEY"),  # type: ignore
                 timeout=60,
                 stop=None,
             )
-
-        else:
-            raise HTTPModelNotSupported()
 
     parser_config = ParseFileConfig(
         method=method,
         strategy=strategy,
@@ -123,15 +129,51 @@ async def parse_file(
 async def upload_url(
     url: str, playwright_loader=Depends(get_playwright_loader)
 ) -> dict[str, str]:
+    if not _check_free_memory():
+        raise HTTPMemoryError()
+
+    # Validate URL format
+    result = urlparse(url)
+    if not all([result.scheme, result.netloc]):
+        raise HTTPException(
+            status_code=400, detail="Failed to load website content: Invalid URL format"
+        )
+
     playwright_loader.urls = [url]
 
     if url.endswith(".pdf"):
-        ## Download the file
-
-        async with httpx.AsyncClient() as client:
-            response = await client.get(url)
-            if response.status_code != 200:
-                raise HTTPDownloadError(url)
+        ## Download the file with retry logic
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                async with httpx.AsyncClient(timeout=30.0) as client:
+                    response = await client.get(url)
+                    response.raise_for_status()
+                    break
+            except (
+                httpx.RequestError,
+                httpx.HTTPStatusError,
+                TimeoutError,
+                ConnectionError,
+            ) as e:
+                if isinstance(e, (httpx.TimeoutException, TimeoutError)):
+                    if attempt == max_retries - 1:
+                        raise HTTPException(
+                            status_code=504,
+                            detail=f"Request timed out after {max_retries} attempts",
+                        )
+                elif isinstance(e, ConnectionError):
+                    if attempt == max_retries - 1:
+                        raise HTTPException(
+                            status_code=429,
+                            detail=f"Failed after {max_retries} attempts: {str(e)}",
+                        )
+                elif attempt == max_retries - 1:
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"Failed to load website content: {str(e)}",
+                    )
+                await asyncio.sleep(2**attempt)  # Exponential backoff
 
         with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file:
             temp_file.write(response.content)
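The retry loop above backs off exponentially (2**attempt seconds between attempts) and maps timeouts, connection failures, and other errors to 504, 429, and 400 respectively. The same pattern, extracted into a standalone sketch that assumes only httpx (this helper is illustrative, not part of the diff):

```python
import asyncio

import httpx


async def fetch_with_retries(url: str, max_retries: int = 3) -> httpx.Response:
    """Fetch a URL, retrying transient failures with exponential backoff."""
    for attempt in range(max_retries):
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(url)
                response.raise_for_status()  # turn 4xx/5xx into HTTPStatusError
                return response
        except (httpx.RequestError, httpx.HTTPStatusError):
            if attempt == max_retries - 1:
                raise  # exhausted all attempts
            await asyncio.sleep(2**attempt)  # 1 s, then 2 s between attempts
    raise AssertionError("unreachable")
```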
@@ -141,23 +183,35 @@ async def upload_url(
             )
             result = await megaparse.aload(temp_file.name)
             return {"message": "File parsed successfully", "result": result}
-        except ParsingException:
-            raise HTTPParsingException(url)
+        except ParsingException as e:
+            raise HTTPParsingException(url, message=str(e))
+        except Exception as e:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Internal server error while parsing PDF: {str(e)}",
+            )
     else:
-        data = await playwright_loader.aload()
-        # Now turn the data into a string
-        extracted_content = ""
-        for page in data:
-            extracted_content += page.page_content
-        if not extracted_content:
-            raise HTTPDownloadError(
-                url,
-                message="Failed to extract content from the website. Valid URL example : https://www.quivr.com",
-            )
-        return {
-            "message": "Website content parsed successfully",
-            "result": extracted_content,
-        }
+        try:
+            data = await playwright_loader.aload()
+            # Now turn the data into a string
+            extracted_content = ""
+            for page in data:
+                extracted_content += page.page_content
+            if not extracted_content:
+                raise HTTPDownloadError(
+                    url,
+                    message="Failed to extract content from the website. Valid URL example : https://www.quivr.com",
+                )
+            return {
+                "message": "Website content parsed successfully",
+                "result": extracted_content,
+            }
+        except Exception as e:
+            # Handle Playwright-specific errors
+            raise HTTPException(
+                status_code=400,
+                detail=f"Failed to load website content: {str(e)}. Make sure the URL is valid and accessible.",
+            )
 
 
 if __name__ == "__main__":
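One behavioral consequence of the parse_file changes: an unrecognized model_name now returns 501 with the list of supported models instead of falling through to a generic failure. A hedged client-side sketch (the route path and form fields below are assumptions; check the FastAPI router in app.py for the real path):

```python
# Illustrative only; "http://localhost:8000/v1/file" is an assumed route.
import httpx

with open("sample.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/v1/file",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"check_table": "true", "model_name": "not-a-real-model"},
    )
print(resp.status_code)  # expect 501
print(resp.json()["detail"])  # "Model not-a-real-model is not supported. ..."
```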
31 changes: 31 additions & 0 deletions libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py
@@ -1,6 +1,37 @@
 from fastapi import HTTPException
 
 
+class HTTPTimeoutError(HTTPException):
+    def __init__(
+        self,
+        url: str,
+        message: str = "Request timed out",
+        headers: dict | None = None,
+    ):
+        detail = f"{url}: {message}"
+        super().__init__(status_code=504, detail=detail, headers=headers)
+
+
+class HTTPMemoryError(HTTPException):
+    def __init__(
+        self,
+        message: str = "Service unavailable due to low memory",
+        headers: dict | None = None,
+    ):
+        super().__init__(status_code=503, detail=message, headers=headers)
+
+
+class HTTPTooManyRequestsError(HTTPException):
+    def __init__(
+        self,
+        url: str,
+        message: str = "Too many retry attempts",
+        headers: dict | None = None,
+    ):
+        detail = f"{url}: {message}"
+        super().__init__(status_code=429, detail=detail, headers=headers)
+
+
 class HTTPModelNotSupported(HTTPException):
     def __init__(
         self,
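Since all of these subclass fastapi.HTTPException, FastAPI's default exception handling already serializes them to JSON error responses, so route code can raise them directly. A minimal usage sketch (the route below is invented for illustration):

```python
from fastapi import FastAPI

from megaparse.api.exceptions.megaparse_exceptions import HTTPTooManyRequestsError

app = FastAPI()


@app.get("/fetch")  # hypothetical route
async def fetch(url: str):
    # A real handler would attempt the download first; raising directly here just
    # shows the resulting payload: {"detail": "<url>: Too many retry attempts"}.
    raise HTTPTooManyRequestsError(url)
```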
1 change: 0 additions & 1 deletion libs/megaparse/src/megaparse/megaparse.py
@@ -1,4 +1,3 @@
-import asyncio
 import logging
 import os
 from pathlib import Path
1 change: 0 additions & 1 deletion libs/megaparse/src/megaparse/parser/llama.py
@@ -1,4 +1,3 @@
-import asyncio
 from pathlib import Path
 from typing import IO, List

2 changes: 1 addition & 1 deletion libs/megaparse/src/megaparse/parser/megaparse_vision.py
@@ -3,7 +3,7 @@
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import IO, List, Union
+from typing import IO, List
 
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import HumanMessage
77 changes: 57 additions & 20 deletions libs/megaparse/tests/conftest.py
@@ -1,6 +1,9 @@
+import asyncio
+import platform
 from pathlib import Path
 from typing import IO
 
+import pytest
 import pytest_asyncio
 from httpx import ASGITransport, AsyncClient
 from langchain_community.document_loaders import PlaywrightURLLoader
@@ -41,30 +44,64 @@ async def aconvert(
         **kwargs,
     ) -> str:
         print("Fake parser is converting the file")
+        # Simulate some async work without actually blocking
+        await asyncio.sleep(0)
         return "Fake conversion result"
 
     return FakeParser()
 
 
+@pytest.fixture(scope="session", autouse=True)
+def event_loop():
+    """Create an instance of the default event loop for each test case."""
+    if platform.system() == "Windows":
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    yield loop
+    loop.close()
+
+
 @pytest_asyncio.fixture(scope="function")
 async def test_client():
-    print("Setting up test_client fixture")
-
-    def fake_parser_builder():
-        return FakeParserBuilder()
-
-    def fake_playwright_loader():
-        class FakePlaywrightLoader(PlaywrightURLLoader):
-            async def aload(self):
-                return [Document(page_content="Fake website content")]
-
-        return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])
-
-    app.dependency_overrides[parser_builder_dep] = fake_parser_builder
-    app.dependency_overrides[get_playwright_loader] = fake_playwright_loader
-    async with AsyncClient(
-        transport=ASGITransport(app=app),  # type: ignore
-        base_url="http://test",
-    ) as ac:
-        yield ac
-    app.dependency_overrides = {}
+    """Async test client fixture with proper resource cleanup and debugging."""
+    print("Setting up test_client fixture - initializing")
+    original_overrides = app.dependency_overrides.copy()
+    client = None
+
+    try:
+
+        def fake_parser_builder():
+            return FakeParserBuilder()
+
+        def fake_playwright_loader():
+            class FakePlaywrightLoader(PlaywrightURLLoader):
+                async def aload(self):
+                    return [Document(page_content="Fake website content")]
+
+            return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])
+
+        print("Setting up test_client fixture - configuring dependencies")
+        app.dependency_overrides[parser_builder_dep] = fake_parser_builder
+        app.dependency_overrides[get_playwright_loader] = fake_playwright_loader
+
+        print("Setting up test_client fixture - creating client")
+        client = AsyncClient(
+            transport=ASGITransport(app=app),  # type: ignore
+            base_url="http://test",
+        )
+        await client.__aenter__()
+        print("Setting up test_client fixture - client ready")
+        yield client
+
+    except Exception as e:
+        print(f"Error in test_client fixture: {str(e)}")
+        raise
+    finally:
+        print("Cleaning up test_client fixture")
+        if client:
+            try:
+                await client.__aexit__(None, None, None)
+            except Exception as e:
+                print(f"Error during client cleanup: {str(e)}")
+        app.dependency_overrides = original_overrides
+        print("Test client cleanup complete")