diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 4c1b6b5..99aa724 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -84,6 +84,13 @@ jobs: run: | UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock + - name: 📥 Download NLTK Data + run: | + # Download required NLTK resources + rye run python libs/megaparse/scripts/download_nltk_data.py + - name: 🚀 Run tests run: | + # Run both API (megaparse) and SDK tests + rye test -p megaparse rye test -p megaparse-sdk diff --git a/libs/megaparse/pyproject.toml b/libs/megaparse/pyproject.toml index f4ff5bc..8300fce 100644 --- a/libs/megaparse/pyproject.toml +++ b/libs/megaparse/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'", "onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'", "pypdfium2>=4.30.0", + "pytest-xdist>=3.6.1", ] [project.optional-dependencies] @@ -54,9 +55,23 @@ build-backend = "hatchling.build" [tool.rye] managed = true -dev-dependencies = [] +dev-dependencies = [ + "pytest>=7.4.0", + "pytest-asyncio>=0.23.0", + "pytest-timeout>=2.1.0", + "pytest-xdist>=3.6.1", + "httpx>=0.24.1" +] universal = true +[tool.pytest.ini_options] +asyncio_mode = "strict" +asyncio_default_fixture_loop_scope = "function" +timeout = 30 +testpaths = ["tests"] +python_files = ["test_*.py"] +addopts = "-v --tb=short" + [tool.hatch.metadata] allow-direct-references = true diff --git a/libs/megaparse/scripts/download_nltk_data.py b/libs/megaparse/scripts/download_nltk_data.py new file mode 100644 index 0000000..081e59c --- /dev/null +++ b/libs/megaparse/scripts/download_nltk_data.py @@ -0,0 +1,23 @@ +"""Script to download required NLTK resources.""" + +import nltk + + +def download_nltk_resources(): + """Download required NLTK resources.""" + resources = [ + "punkt", + "punkt_tab", + "averaged_perceptron_tagger", + "averaged_perceptron_tagger_eng", + "maxent_ne_chunker", + "words", + "stopwords", + ] + for resource in resources: + print(f"Downloading {resource}...") + nltk.download(resource) + + +if __name__ == "__main__": + download_nltk_resources() diff --git a/libs/megaparse/src/megaparse/api/app.py b/libs/megaparse/src/megaparse/api/app.py index d6f0ae8..a3b6e03 100644 --- a/libs/megaparse/src/megaparse/api/app.py +++ b/libs/megaparse/src/megaparse/api/app.py @@ -1,7 +1,9 @@ +import asyncio import io import os import tempfile from typing import Optional +from urllib.parse import urlparse import httpx import psutil @@ -22,7 +24,7 @@ from megaparse.api.exceptions.megaparse_exceptions import ( HTTPDownloadError, HTTPFileNotFound, - HTTPModelNotSupported, + HTTPMemoryError, HTTPParsingException, ParsingException, ) @@ -67,28 +69,32 @@ async def parse_file( check_table: bool = Form(False), language: Language = Form(Language.ENGLISH), parsing_instruction: Optional[str] = Form(None), - model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O), + model_name: Optional[str] = Form(SupportedModel.GPT_4O.value), parser_builder=Depends(parser_builder_dep), ) -> dict[str, str]: if not _check_free_memory(): - raise HTTPException( - status_code=503, detail="Service unavailable due to low memory" - ) + raise HTTPMemoryError() model = None if model_name and check_table: - if model_name.startswith("gpt"): - model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY")) # type: ignore - elif model_name.startswith("claude"): + supported_models = [model.value for model in SupportedModel] + if model_name not in supported_models: + raise HTTPException( + status_code=501, + detail=f"Model {model_name} is not supported. Please use one of {supported_models}", + ) + model_name_str = model_name + if model_name_str.startswith("gpt"): + model = ChatOpenAI( + model=model_name_str, api_key=os.getenv("OPENAI_API_KEY") + ) # type: ignore + elif model_name_str.startswith("claude"): model = ChatAnthropic( - model_name=model_name, + model_name=model_name_str, api_key=os.getenv("ANTHROPIC_API_KEY"), # type: ignore timeout=60, stop=None, ) - else: - raise HTTPModelNotSupported() - parser_config = ParseFileConfig( method=method, strategy=strategy, @@ -123,15 +129,51 @@ async def parse_file( async def upload_url( url: str, playwright_loader=Depends(get_playwright_loader) ) -> dict[str, str]: + if not _check_free_memory(): + raise HTTPMemoryError() + + # Validate URL format + result = urlparse(url) + if not all([result.scheme, result.netloc]): + raise HTTPException( + status_code=400, detail="Failed to load website content: Invalid URL format" + ) + playwright_loader.urls = [url] if url.endswith(".pdf"): - ## Download the file - - async with httpx.AsyncClient() as client: - response = await client.get(url) - if response.status_code != 200: - raise HTTPDownloadError(url) + ## Download the file with retry logic + max_retries = 3 + for attempt in range(max_retries): + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(url) + response.raise_for_status() + break + except ( + httpx.RequestError, + httpx.HTTPStatusError, + TimeoutError, + ConnectionError, + ) as e: + if isinstance(e, (httpx.TimeoutException, TimeoutError)): + if attempt == max_retries - 1: + raise HTTPException( + status_code=504, + detail=f"Request timed out after {max_retries} attempts", + ) + elif isinstance(e, ConnectionError): + if attempt == max_retries - 1: + raise HTTPException( + status_code=429, + detail=f"Failed after {max_retries} attempts: {str(e)}", + ) + elif attempt == max_retries - 1: + raise HTTPException( + status_code=400, + detail=f"Failed to load website content: {str(e)}", + ) + await asyncio.sleep(2**attempt) # Exponential backoff with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file: temp_file.write(response.content) @@ -141,23 +183,35 @@ async def upload_url( ) result = await megaparse.aload(temp_file.name) return {"message": "File parsed successfully", "result": result} - except ParsingException: - raise HTTPParsingException(url) + except ParsingException as e: + raise HTTPParsingException(url, message=str(e)) + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Internal server error while parsing PDF: {str(e)}", + ) else: - data = await playwright_loader.aload() - # Now turn the data into a string - extracted_content = "" - for page in data: - extracted_content += page.page_content - if not extracted_content: - raise HTTPDownloadError( - url, - message="Failed to extract content from the website. Valid URL example : https://www.quivr.com", + try: + data = await playwright_loader.aload() + # Now turn the data into a string + extracted_content = "" + for page in data: + extracted_content += page.page_content + if not extracted_content: + raise HTTPDownloadError( + url, + message="Failed to extract content from the website. Valid URL example : https://www.quivr.com", + ) + return { + "message": "Website content parsed successfully", + "result": extracted_content, + } + except Exception as e: + # Handle Playwright-specific errors + raise HTTPException( + status_code=400, + detail=f"Failed to load website content: {str(e)}. Make sure the URL is valid and accessible.", ) - return { - "message": "Website content parsed successfully", - "result": extracted_content, - } if __name__ == "__main__": diff --git a/libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py b/libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py index e343fd2..00f0c78 100644 --- a/libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py +++ b/libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py @@ -1,6 +1,37 @@ from fastapi import HTTPException +class HTTPTimeoutError(HTTPException): + def __init__( + self, + url: str, + message: str = "Request timed out", + headers: dict | None = None, + ): + detail = f"{url}: {message}" + super().__init__(status_code=504, detail=detail, headers=headers) + + +class HTTPMemoryError(HTTPException): + def __init__( + self, + message: str = "Service unavailable due to low memory", + headers: dict | None = None, + ): + super().__init__(status_code=503, detail=message, headers=headers) + + +class HTTPTooManyRequestsError(HTTPException): + def __init__( + self, + url: str, + message: str = "Too many retry attempts", + headers: dict | None = None, + ): + detail = f"{url}: {message}" + super().__init__(status_code=429, detail=detail, headers=headers) + + class HTTPModelNotSupported(HTTPException): def __init__( self, diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 191edec..fbe3e77 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -1,4 +1,3 @@ -import asyncio import logging import os from pathlib import Path diff --git a/libs/megaparse/src/megaparse/parser/llama.py b/libs/megaparse/src/megaparse/parser/llama.py index 9cb0d8c..aae9058 100644 --- a/libs/megaparse/src/megaparse/parser/llama.py +++ b/libs/megaparse/src/megaparse/parser/llama.py @@ -1,4 +1,3 @@ -import asyncio from pathlib import Path from typing import IO, List diff --git a/libs/megaparse/src/megaparse/parser/megaparse_vision.py b/libs/megaparse/src/megaparse/parser/megaparse_vision.py index 0b05e73..ea7aee6 100644 --- a/libs/megaparse/src/megaparse/parser/megaparse_vision.py +++ b/libs/megaparse/src/megaparse/parser/megaparse_vision.py @@ -3,7 +3,7 @@ import re from io import BytesIO from pathlib import Path -from typing import IO, List, Union +from typing import IO, List from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage diff --git a/libs/megaparse/tests/conftest.py b/libs/megaparse/tests/conftest.py index e898f81..7cffec0 100644 --- a/libs/megaparse/tests/conftest.py +++ b/libs/megaparse/tests/conftest.py @@ -1,6 +1,9 @@ +import asyncio +import platform from pathlib import Path from typing import IO +import pytest import pytest_asyncio from httpx import ASGITransport, AsyncClient from langchain_community.document_loaders import PlaywrightURLLoader @@ -41,30 +44,64 @@ async def aconvert( **kwargs, ) -> str: print("Fake parser is converting the file") + # Simulate some async work without actually blocking + await asyncio.sleep(0) return "Fake conversion result" return FakeParser() +@pytest.fixture(scope="session", autouse=True) +def event_loop(): + """Create an instance of the default event loop for each test case.""" + if platform.system() == "Windows": + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + loop = asyncio.get_event_loop_policy().new_event_loop() + yield loop + loop.close() + + @pytest_asyncio.fixture(scope="function") async def test_client(): - print("Setting up test_client fixture") - - def fake_parser_builder(): - return FakeParserBuilder() - - def fake_playwright_loader(): - class FakePlaywrightLoader(PlaywrightURLLoader): - async def aload(self): - return [Document(page_content="Fake website content")] - - return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"]) - - app.dependency_overrides[parser_builder_dep] = fake_parser_builder - app.dependency_overrides[get_playwright_loader] = fake_playwright_loader - async with AsyncClient( - transport=ASGITransport(app=app), # type: ignore - base_url="http://test", - ) as ac: - yield ac - app.dependency_overrides = {} + """Async test client fixture with proper resource cleanup and debugging.""" + print("Setting up test_client fixture - initializing") + original_overrides = app.dependency_overrides.copy() + client = None + + try: + + def fake_parser_builder(): + return FakeParserBuilder() + + def fake_playwright_loader(): + class FakePlaywrightLoader(PlaywrightURLLoader): + async def aload(self): + return [Document(page_content="Fake website content")] + + return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"]) + + print("Setting up test_client fixture - configuring dependencies") + app.dependency_overrides[parser_builder_dep] = fake_parser_builder + app.dependency_overrides[get_playwright_loader] = fake_playwright_loader + + print("Setting up test_client fixture - creating client") + client = AsyncClient( + transport=ASGITransport(app=app), # type: ignore + base_url="http://test", + ) + await client.__aenter__() + print("Setting up test_client fixture - client ready") + yield client + + except Exception as e: + print(f"Error in test_client fixture: {str(e)}") + raise + finally: + print("Cleaning up test_client fixture") + if client: + try: + await client.__aexit__(None, None, None) + except Exception as e: + print(f"Error during client cleanup: {str(e)}") + app.dependency_overrides = original_overrides + print("Test client cleanup complete") diff --git a/libs/megaparse/tests/pdf/test_all_parsers.py b/libs/megaparse/tests/pdf/test_all_parsers.py index 9ac8204..3e80b91 100644 --- a/libs/megaparse/tests/pdf/test_all_parsers.py +++ b/libs/megaparse/tests/pdf/test_all_parsers.py @@ -1,8 +1,6 @@ import pytest from megaparse import MegaParse from megaparse.parser.doctr_parser import DoctrParser -from megaparse.parser.llama import LlamaParser -from megaparse.parser.megaparse_vision import MegaParseVision from megaparse.parser.unstructured_parser import UnstructuredParser PARSER_LIST = [ diff --git a/libs/megaparse/tests/test_endpoints.py b/libs/megaparse/tests/test_endpoints.py index 87a7e92..8f53ffb 100644 --- a/libs/megaparse/tests/test_endpoints.py +++ b/libs/megaparse/tests/test_endpoints.py @@ -1,7 +1,10 @@ +from unittest.mock import patch + import pytest @pytest.mark.asyncio +@pytest.mark.timeout(30) async def test_parse_file_endpoint(test_client): # Simulate a request to the parse endpoint with open("./tests/pdf/sample_pdf.pdf", "rb") as file: @@ -20,6 +23,63 @@ async def test_parse_file_endpoint(test_client): @pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_file_missing(test_client): + """Test handling of missing file.""" + response = await test_client.post( + "/v1/file", + files={}, + data={ + "method": "unstructured", + "strategy": "auto", + "language": "en", + "check_table": False, + }, + ) + assert response.status_code == 422 # FastAPI validation error + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_file_unsupported_model(test_client): + """Test handling of unsupported model.""" + with open("./tests/pdf/sample_pdf.pdf", "rb") as file: + response = await test_client.post( + "/v1/file", + files={"file": ("test.pdf", file)}, + data={ + "method": "unstructured", + "strategy": "auto", + "language": "en", + "check_table": True, + "model_name": "unsupported-model", + }, + ) + assert response.status_code == 501 + assert "not supported" in response.json()["detail"] + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_file_memory_error(test_client): + """Test handling of memory limit exceeded.""" + with patch("megaparse.api.app._check_free_memory", return_value=False): + with open("./tests/pdf/sample_pdf.pdf", "rb") as file: + response = await test_client.post( + "/v1/file", + files={"file": ("test.pdf", file)}, + data={ + "method": "unstructured", + "strategy": "auto", + "language": "en", + }, + ) + assert response.status_code == 503 + assert "memory" in response.json()["detail"] + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) async def test_parse_url_endpoint(test_client): response = await test_client.post("/v1/url?url=https://www.quivr.com") assert response.status_code == 200 @@ -27,3 +87,42 @@ async def test_parse_url_endpoint(test_client): "message": "Website content parsed successfully", "result": "Fake website content", } + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_url_invalid(test_client): + """Test handling of invalid URL.""" + response = await test_client.post("/v1/url?url=invalid-url") + assert response.status_code == 400 + assert "Failed to load website content" in response.json()["detail"] + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_url_timeout(test_client): + """Test handling of URL timeout.""" + with patch("httpx.AsyncClient.get", side_effect=TimeoutError): + response = await test_client.post("/v1/url?url=https://example.com/test.pdf") + assert response.status_code == 504 + assert "timed out" in response.json()["detail"] + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_url_memory_error(test_client): + """Test handling of memory limit exceeded for URL parsing.""" + with patch("megaparse.api.app._check_free_memory", return_value=False): + response = await test_client.post("/v1/url?url=https://example.com") + assert response.status_code == 503 + assert "memory" in response.json()["detail"] + + +@pytest.mark.asyncio +@pytest.mark.timeout(30) +async def test_parse_url_too_many_retries(test_client): + """Test handling of too many retry attempts.""" + with patch("httpx.AsyncClient.get", side_effect=ConnectionError): + response = await test_client.post("/v1/url?url=https://example.com/test.pdf") + assert response.status_code == 429 + assert "Failed after" in response.json()["detail"] diff --git a/libs/megaparse/tests/test_parsers.py b/libs/megaparse/tests/test_parsers.py index ae081dd..8c38def 100644 --- a/libs/megaparse/tests/test_parsers.py +++ b/libs/megaparse/tests/test_parsers.py @@ -2,8 +2,8 @@ import pytest from megaparse.parser.doctr_parser import DoctrParser -from megaparse.parser.llama import LlamaParser -from megaparse.parser.megaparse_vision import MegaParseVision + +# Parsers will be imported when needed for specific tests from megaparse.parser.unstructured_parser import UnstructuredParser from megaparse_sdk.schema.extensions import FileExtension