From 42a1c683f5bd8da03e6b9c627fe2db44894fbe6a Mon Sep 17 00:00:00 2001 From: George Burton <8233643+gecBurton@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:32:39 +0100 Subject: [PATCH] Feature/remove-file-route (#1009) * remove unused code * remove unused tests * direct file control * check index exists first * removed http mocking * wip * delete unused files * formatting * tests passing * remove static * reinstated mocking unstructured * moved s3_client to conftest * core_file.uuid now random * core_file.uuid now random * remove file route * removed storage * remove unused code * fix tests * remove file status * removed user-uuid from meatdata * remove File * remove File from django * update core-api --------- Co-authored-by: George Burton --- core-api/core_api/app.py | 12 +- core-api/core_api/routes/file.py | 204 ------------------ core-api/tests/conftest.py | 64 +----- core-api/tests/routes/test_file.py | 148 ------------- core-api/tests/test_auth.py | 26 --- django_app/redbox_app/worker.py | 12 +- redbox-core/redbox/chains/ingest.py | 7 +- redbox-core/redbox/loader/base.py | 5 +- redbox-core/redbox/loader/ingester.py | 25 +-- redbox-core/redbox/loader/loaders.py | 6 +- redbox-core/redbox/models/__init__.py | 20 -- redbox-core/redbox/models/base.py | 22 -- redbox-core/redbox/models/errors.py | 35 --- redbox-core/redbox/models/file.py | 58 +---- redbox-core/redbox/models/persona.py | 7 - redbox-core/redbox/models/status.py | 7 - redbox-core/redbox/retriever/queries.py | 28 +-- redbox-core/redbox/storage/__init__.py | 7 - redbox-core/redbox/storage/elasticsearch.py | 167 -------------- redbox-core/redbox/storage/storage_handler.py | 62 ------ redbox-core/redbox/test/data.py | 14 +- redbox-core/tests/conftest.py | 54 +---- redbox-core/tests/graph/test_patterns.py | 4 +- redbox-core/tests/storage/__init__.py | 0 .../tests/storage/test_elasticsearch.py | 149 ------------- redbox-core/tests/test_ingest.py | 37 ++-- 26 files changed, 66 insertions(+), 1114 deletions(-) delete mode 100644 core-api/core_api/routes/file.py delete mode 100644 core-api/tests/routes/test_file.py delete mode 100644 core-api/tests/test_auth.py delete mode 100644 redbox-core/redbox/models/base.py delete mode 100644 redbox-core/redbox/models/persona.py delete mode 100644 redbox-core/redbox/models/status.py delete mode 100644 redbox-core/redbox/storage/__init__.py delete mode 100644 redbox-core/redbox/storage/elasticsearch.py delete mode 100644 redbox-core/redbox/storage/storage_handler.py delete mode 100644 redbox-core/tests/storage/__init__.py delete mode 100644 redbox-core/tests/storage/test_elasticsearch.py diff --git a/core-api/core_api/app.py b/core-api/core_api/app.py index 65a328a4b..51892b6fb 100644 --- a/core-api/core_api/app.py +++ b/core-api/core_api/app.py @@ -7,9 +7,10 @@ from fastapi.responses import RedirectResponse from core_api.routes.chat import chat_app -from core_api.routes.file import file_app from redbox import __version__ as redbox_version -from redbox.models import Settings, StatusResponse +from redbox.models import Settings +from pydantic import BaseModel + # === Logging === @@ -52,6 +53,12 @@ def root(): return RedirectResponse(url="/docs") +class StatusResponse(BaseModel): + status: str + uptime_seconds: float + version: str + + @app.get("/health", status_code=HTTPStatus.OK, tags=["health"]) def health(response: Response) -> StatusResponse: """Returns the health of the API @@ -79,4 +86,3 @@ def health(response: Response) -> StatusResponse: app.mount("/chat", chat_app) -app.mount("/file", file_app) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py deleted file mode 100644 index 186282452..000000000 --- a/core-api/core_api/routes/file.py +++ /dev/null @@ -1,204 +0,0 @@ -import logging -from typing import Annotated -from uuid import UUID - -from elasticsearch import NotFoundError -from fastapi import Depends, FastAPI, UploadFile -from fastapi import File as FastAPIFile -from fastapi.responses import JSONResponse -from pydantic import BaseModel, Field - -from core_api.auth import get_user_uuid -from redbox.models import APIError404, File, ProcessingStatusEnum, Settings -from redbox.storage import ElasticsearchStorageHandler - -# === Functions === - - -def file_not_found_response(file_uuid: UUID) -> JSONResponse: - return JSONResponse( - status_code=404, - content={ - "detail": "Item not found", - "errors": { - "parameter": "file_uuid", - "detail": f"File {file_uuid} not found", - }, - }, - ) - - -# === Logging === - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger() - -env = Settings() - - -# === Object Store === - -s3 = env.s3_client() - - -# === Storage === - -es = env.elasticsearch_client() -storage_handler = ElasticsearchStorageHandler(es_client=es, root_index=env.elastic_root_index) - - -file_app = FastAPI( - title="Core File API", - description="Redbox Core File API", - version="0.1.0", - openapi_tags=[ - {"name": "file", "description": "File endpoints"}, - ], - docs_url="/docs", - redoc_url="/redoc", - openapi_url="/openapi.json", -) - - -class FileRequest(BaseModel): - """Reference to file stored on s3""" - - key: str = Field(description="file key", examples=["policies.pdf"]) - - -@file_app.post("/", tags=["file"], status_code=201) -async def add_file(file_request: FileRequest, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: - """Create a File record in the database - - Args: - file_request (FileRequest): The file to be recorded - user_uuid (UUID): The UUID of the user - - Returns: - File: The file uuid from the elastic database - """ - - file = File( - key=file_request.key, - bucket=env.bucket_name, - creator_user_uuid=user_uuid, - ingest_status=ProcessingStatusEnum.processing, - ) - - storage_handler.write_item(file) - - return file - - -@file_app.get("/", tags=["file"]) -async def list_files(user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> list[File]: - """Gets a list of files in the database. - - Args: - user_uuid (UUID): The UUID of the user - - Returns: - Files (list, File): A list of file objects - """ - return storage_handler.read_all_items(model_type="File", user_uuid=user_uuid) - - -# Standard file upload endpoint for utility in quick testing -if env.dev_mode: - - @file_app.post("/upload", tags=["file"], response_model=File) - async def upload_file(user_uuid: Annotated[UUID, Depends(get_user_uuid)], file: UploadFile = None) -> File: - """Upload a file to the object store - - Args: - file (UploadFile): The file to upload - - Returns: - File: The file that was uploaded - """ - file = file or FastAPIFile(...) - key = file.filename - s3.upload_fileobj(file.file, env.bucket_name, key) - - file = File( - key=key, bucket=env.bucket_name, creator_user_uuid=user_uuid, ingest_status=ProcessingStatusEnum.processing - ) - storage_handler.write_item(file) - - return file - - -@file_app.get( - "/{file_uuid}", - response_model=File, - tags=["file"], - responses={404: {"model": APIError404, "description": "The file was not found"}}, -) -def get_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: - """Get a file from the object store - - Args: - file_uuid (UUID): The UUID of the file to get - user_uuid (UUID): The UUID of the user - - Returns: - File: The file - - Raises: - 404: If the file isn't found, or the creator and requester don't match - """ - try: - file = storage_handler.read_item(file_uuid, model_type="File") - except NotFoundError: - return file_not_found_response(file_uuid=file_uuid) - - if file.creator_user_uuid != user_uuid: - return file_not_found_response(file_uuid=file_uuid) - - return file - - -@file_app.delete( - "/{file_uuid}", - response_model=File, - tags=["file"], - responses={404: {"model": APIError404, "description": "The file was not found"}}, -) -def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: - """Delete a file from the object store and the database - - Args: - file_uuid (UUID): The UUID of the file to delete - user_uuid (UUID): The UUID of the user - - Returns: - File: The file that was deleted - - Raises: - 404: If the file isn't found, or the creator and requester don't match - """ - try: - file = storage_handler.read_item(file_uuid, model_type="File") - except NotFoundError: - return file_not_found_response(file_uuid=file_uuid) - - if file.creator_user_uuid != user_uuid: - return file_not_found_response(file_uuid=file_uuid) - - storage_handler.delete_item(file) - - storage_handler.delete_user_items( - model_type="chunk", - user_uuid=user_uuid, - filters=[ - { - "bool": { - "should": [ - {"term": {"file_name.keyword": file.key}}, - {"term": {"metadata.file_name.keyword": file.key}}, - ] - } - } - ], - ) - return file diff --git a/core-api/tests/conftest.py b/core-api/tests/conftest.py index d96e4acaf..e8531a63c 100644 --- a/core-api/tests/conftest.py +++ b/core-api/tests/conftest.py @@ -10,9 +10,8 @@ from langchain_core.documents.base import Document from langchain_core.embeddings.fake import FakeEmbeddings from langchain_elasticsearch.vectorstores import ElasticsearchStore -from redbox.models import File, Settings +from redbox.models import Settings from redbox.models.file import ChunkMetadata, ChunkResolution -from redbox.storage import ElasticsearchStorageHandler from core_api import dependencies from core_api.app import app as application @@ -33,11 +32,6 @@ def es_client(env: Settings) -> Elasticsearch: return env.elasticsearch_client() -@pytest.fixture() -def es_storage_handler(es_client: Elasticsearch, env: Settings) -> ElasticsearchStorageHandler: - return ElasticsearchStorageHandler(es_client=es_client, root_index=env.elastic_root_index) - - @pytest.fixture(scope="session") def es_index(env: Settings) -> str: return f"{env.elastic_root_index}-chunk" @@ -115,23 +109,21 @@ def file_pdf_path() -> Path: @pytest.fixture() -def file_pdf_object(file_pdf_path: Path, alice: UUID, env: Settings) -> File: +def file_pdf(file_pdf_path: Path, alice: UUID, env: Settings) -> str: """The unuploaded File object of Alice's PDF.""" file_name = file_pdf_path.name - return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice) + return file_name @pytest.fixture() -def file_pdf_chunks(file_pdf_object: File) -> list[Document]: +def file_pdf_chunks(file_pdf) -> list[Document]: """The Document chunk objects of Alice's PDF.""" normal_chunks = [ Document( page_content="hello", metadata=ChunkMetadata( - parent_file_uuid=str(file_pdf_object.uuid), index=i, - file_name=file_pdf_object.key, - creator_user_uuid=file_pdf_object.creator_user_uuid, + file_name=file_pdf, page_number=4, created_datetime=datetime.now(UTC), token_count=4, @@ -145,10 +137,8 @@ def file_pdf_chunks(file_pdf_object: File) -> list[Document]: Document( page_content="hello" * 10, metadata=ChunkMetadata( - parent_file_uuid=str(file_pdf_object.uuid), index=i, - file_name=file_pdf_object.key, - creator_user_uuid=file_pdf_object.creator_user_uuid, + file_name=file_pdf, page_number=4, created_datetime=datetime.now(UTC), token_count=20, @@ -160,20 +150,6 @@ def file_pdf_chunks(file_pdf_object: File) -> list[Document]: return normal_chunks + large_chunks -@pytest.fixture() -def file_pdf( - es_store: ElasticsearchStore, - es_storage_handler: ElasticsearchStorageHandler, - file_pdf_object: File, - file_pdf_chunks: list[Document], -) -> File: - """The File object of Alice's PDF, with all objects in the Elasticsearch index.""" - es_storage_handler.write_item(file_pdf_object) - es_storage_handler.refresh() - es_store.add_documents(file_pdf_chunks) - return file_pdf_object - - @pytest.fixture() def file_html_path() -> Path: """The path of Alice's HTML.""" @@ -181,23 +157,21 @@ def file_html_path() -> Path: @pytest.fixture() -def file_html_object(file_html_path: Path, alice: UUID, env: Settings) -> File: +def file_html(file_html_path: Path, alice: UUID, env: Settings) -> str: """The unuploaded File object of Alice's HTML.""" file_name = file_html_path.name - return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice) + return file_name @pytest.fixture() -def file_html_chunks(file_html_object: File) -> list[Document]: +def file_html_chunks(file_html: str) -> list[Document]: """The Document chunk objects of Alice's HTML.""" normal_chunks = [ Document( page_content="hello", metadata=ChunkMetadata( - parent_file_uuid=str(file_html_object.uuid), index=i, - file_name=file_html_object.key, - creator_user_uuid=file_html_object.creator_user_uuid, + file_name=file_html, page_number=4, created_datetime=datetime.now(UTC), token_count=4, @@ -211,10 +185,8 @@ def file_html_chunks(file_html_object: File) -> list[Document]: Document( page_content="hello" * 10, metadata=ChunkMetadata( - parent_file_uuid=str(file_html_object.uuid), index=i, - file_name=file_html_object.key, - creator_user_uuid=file_html_object.creator_user_uuid, + file_name=file_html, page_number=4, created_datetime=datetime.now(UTC), token_count=20, @@ -224,17 +196,3 @@ def file_html_chunks(file_html_object: File) -> list[Document]: for i in range(2) ] return normal_chunks + large_chunks - - -@pytest.fixture() -def file_html( - es_store: ElasticsearchStore, - es_storage_handler: ElasticsearchStorageHandler, - file_html_object: File, - file_html_chunks: list[Document], -) -> File: - """The File object of Alice's HTML, with all objects in the Elasticsearch index.""" - es_storage_handler.write_item(file_html_object) - es_storage_handler.refresh() - es_store.add_documents(file_html_chunks) - return file_html_object diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py deleted file mode 100644 index 5170b5533..000000000 --- a/core-api/tests/routes/test_file.py +++ /dev/null @@ -1,148 +0,0 @@ -import json -from http import HTTPStatus -from pathlib import Path - -import pytest -from elasticsearch import NotFoundError -from fastapi.testclient import TestClient -from redbox.models import File -from redbox.storage import ElasticsearchStorageHandler - -from core_api.routes.file import env - - -@pytest.mark.asyncio() -async def test_post_file_upload(app_client: TestClient, file_pdf_path: Path, headers: dict[str, str]): - """ - Given a new file - When I POST it to /file - I Expect to see it persisted in elastic-search - """ - - file_key = file_pdf_path.name - - response = app_client.post( - "/file", - json={ - "key": file_key, - "bucket": env.bucket_name, - }, - headers=headers, - ) - assert response.status_code == HTTPStatus.CREATED - - file = json.loads(response.content.decode("utf-8")) - assert file["ingest_status"] == "processing" - - -def test_list_files(app_client: TestClient, file_pdf: File, headers: dict[str, str]): - """ - Given a previously saved file - When I GET all files from /file - I Expect the response to contain this file - """ - response = app_client.get("/file", headers=headers) - assert response.status_code == HTTPStatus.OK - - file_list = json.loads(response.content.decode("utf-8")) - assert len(file_list) > 0 - - assert str(file_pdf.uuid) in [file["uuid"] for file in file_list] - - -def test_get_file(app_client: TestClient, file_pdf: File, headers: dict[str, str]): - """ - Given a previously saved file - When I GET it from /file/uuid - I Expect to receive it - """ - response = app_client.get(f"/file/{file_pdf.uuid}", headers=headers) - assert response.status_code == HTTPStatus.OK - - -def test_get_missing_file(app_client: TestClient, headers: dict[str, str]): - """ - Given a nonexistent file - When I GET it from /file/uuid - I Expect to receive a 404 error - """ - response = app_client.get("/file/ffffffff-ffff-ffff-ffff-ffffffffffff", headers=headers) - assert response.status_code == HTTPStatus.NOT_FOUND - - -def test_delete_file( - app_client: TestClient, - es_storage_handler: ElasticsearchStorageHandler, - file_pdf: File, - file_html: File, - headers: dict[str, str], -): - """ - Given a previously saved file - When I DELETE it to /file - I Expect to see it removed from s3 and elastic-search, including the chunks - """ - - def make_chunk_file_filter(s3_key: str) -> list[dict]: - return [ - { - "bool": { - "should": [ - {"term": {"file_name.keyword": s3_key}}, - {"term": {"metadata.file_name.keyword": s3_key}}, - ] - } - } - ] - - # Check assets exist - assert es_storage_handler.read_item(item_uuid=file_pdf.uuid, model_type="file") - assert es_storage_handler.list_all_items( - model_type="chunk", user_uuid=file_pdf.creator_user_uuid, filters=make_chunk_file_filter(file_pdf.key) - ) - assert es_storage_handler.read_item(item_uuid=file_html.uuid, model_type="file") - assert es_storage_handler.list_all_items( - model_type="chunk", user_uuid=file_html.creator_user_uuid, filters=make_chunk_file_filter(file_html.key) - ) - - # Delete the PDF - response = app_client.delete(f"/file/{file_pdf.uuid}", headers=headers) - assert response.status_code == HTTPStatus.OK - - es_storage_handler.refresh() - - # Check the PDF doesn't exist - - with pytest.raises(NotFoundError): - es_storage_handler.read_item(item_uuid=file_pdf.uuid, model_type="file") - - assert not es_storage_handler.list_all_items( - model_type="chunk", user_uuid=file_pdf.creator_user_uuid, filters=make_chunk_file_filter(file_pdf.key) - ) - - # Check the HTML still exists - - assert es_storage_handler.read_item(item_uuid=file_html.uuid, model_type="file") - assert es_storage_handler.list_all_items( - model_type="chunk", user_uuid=file_html.creator_user_uuid, filters=make_chunk_file_filter(file_html.key) - ) - - -def test_delete_missing_file(app_client: TestClient, headers: dict[str, str]): - """ - Given a nonexistent file - When I DELETE it to /file - I Expect to receive a 404 error - """ - response = app_client.delete("/file/ffffffff-ffff-ffff-ffff-ffffffffffff", headers=headers) - assert response.status_code == HTTPStatus.NOT_FOUND - - -def test_get_missing_file_chunks(app_client: TestClient, headers: dict[str, str]): - """ - Given a nonexistent file - When I GET it from /file/uuid/chunks - I Expect to receive a 404 error - """ - response = app_client.get("/file/ffffffff-ffff-ffff-ffff-ffffffffffff/chunks", headers=headers) - assert response.status_code == HTTPStatus.NOT_FOUND diff --git a/core-api/tests/test_auth.py b/core-api/tests/test_auth.py deleted file mode 100644 index 3789017c2..000000000 --- a/core-api/tests/test_auth.py +++ /dev/null @@ -1,26 +0,0 @@ -from uuid import uuid4 - -import pytest -from fastapi.testclient import TestClient -from jose import jwt -from redbox.models import File - - -@pytest.mark.parametrize( - ("malformed_headers", "status_code"), - [ - (None, 403), - ({"Authorization": "blah blah"}, 403), - ({"Authorization": "Bearer blah-blah"}, 401), - ({"Authorization": "Bearer " + jwt.encode({"user_uuid": "not a uuid"}, key="super-secure-private-key")}, 401), - ({"Authorization": "Bearer " + jwt.encode({"user_uuid": str(uuid4())}, key="super-secure-private-key")}, 404), - ], -) -def test_get_file_fails_auth(app_client: TestClient, file_pdf: File, malformed_headers: dict | None, status_code: int): - """ - Given a previously saved file - When I GET it from /file/uuid with a missing/broken/correct header - I Expect get an appropriate status_code - """ - response = app_client.get(f"/file/{file_pdf.uuid}", headers=malformed_headers) - assert response.status_code == status_code diff --git a/django_app/redbox_app/worker.py b/django_app/redbox_app/worker.py index 30c75c565..83eb64177 100644 --- a/django_app/redbox_app/worker.py +++ b/django_app/redbox_app/worker.py @@ -1,11 +1,7 @@ import logging from uuid import UUID -from django.conf import settings - from redbox.loader.ingester import ingest_file -from redbox.models import File as CoreFile -from redbox.models import ProcessingStatusEnum def ingest(file_id: UUID): @@ -16,13 +12,7 @@ def ingest(file_id: UUID): logging.info("Ingesting file: %s", file) - core_file = CoreFile( - key=file.unique_name, - bucket=settings.BUCKET_NAME, - creator_user_uuid=file.user.id, - ) - core_file.ingest_status = ProcessingStatusEnum.embedding - if error := ingest_file(core_file): + if error := ingest_file(file.unique_name): file.status = StatusEnum.errored file.ingest_error = error else: diff --git a/redbox-core/redbox/chains/ingest.py b/redbox-core/redbox/chains/ingest.py index c06cd83b1..d3afab406 100644 --- a/redbox-core/redbox/chains/ingest.py +++ b/redbox-core/redbox/chains/ingest.py @@ -8,7 +8,6 @@ from langchain_core.runnables import RunnableLambda, chain, Runnable from redbox.models.settings import Settings -from redbox.models.file import File from redbox.loader.base import BaseRedboxFileLoader @@ -29,9 +28,9 @@ def log_chunks(chunks: list[Document]): def document_loader(document_loader_type: type[BaseRedboxFileLoader], s3_client: S3Client, env: Settings) -> Runnable: @chain - def wrapped(file: File): - file_bytes = s3_client.get_object(Bucket=file.bucket, Key=file.key)["Body"].read() - return document_loader_type(file=file, file_bytes=BytesIO(file_bytes), env=env).lazy_load() + def wrapped(file_name: str): + file_bytes = s3_client.get_object(Bucket=env.bucket_name, Key=file_name)["Body"].read() + return document_loader_type(file_name=file_name, file_bytes=BytesIO(file_bytes), env=env).lazy_load() return wrapped diff --git a/redbox-core/redbox/loader/base.py b/redbox-core/redbox/loader/base.py index e9c53f103..9194ffd00 100644 --- a/redbox-core/redbox/loader/base.py +++ b/redbox-core/redbox/loader/base.py @@ -2,17 +2,16 @@ from langchain_core.document_loaders import BaseLoader -from redbox.models.file import File from redbox.models.settings import Settings class BaseRedboxFileLoader(BaseLoader): - def __init__(self, file: File, file_bytes: BytesIO, env: Settings) -> None: + def __init__(self, file_name: str, file_bytes: BytesIO, env: Settings) -> None: """Initialize the loader with a file path. Args: file: The Redbox File to load """ - self.file = file + self.file_name = file_name self.file_bytes = file_bytes self.env = env diff --git a/redbox-core/redbox/loader/ingester.py b/redbox-core/redbox/loader/ingester.py index ad1cd5b33..2acc478f8 100644 --- a/redbox-core/redbox/loader/ingester.py +++ b/redbox-core/redbox/loader/ingester.py @@ -7,9 +7,7 @@ from redbox.chains.components import get_embeddings from redbox.chains.ingest import ingest_from_loader from redbox.loader import UnstructuredLargeChunkLoader, UnstructuredTitleLoader -from redbox.models import File -from redbox.models import ProcessingStatusEnum, Settings -from redbox.storage.elasticsearch import ElasticsearchStorageHandler +from redbox.models import Settings if TYPE_CHECKING: from mypy_boto3_s3.client import S3Client @@ -36,19 +34,13 @@ def get_elasticsearch_store_without_embeddings(es, es_index_name: str): return ElasticsearchStore(index_name=es_index_name, es_connection=es, query_field="text", strategy=BM25Strategy()) -def get_elasticsearch_storage_handler(es): - return ElasticsearchStorageHandler(es, env.elastic_root_index) +def ingest_file(file_name: str) -> str | None: + logging.info("Ingesting file: %s", file_name) - -def ingest_file(core_file: File) -> str | None: - logging.info("Ingesting file: %s", core_file) - - core_file.ingest_status = ProcessingStatusEnum.embedding es = env.elasticsearch_client() es_index_name = f"{env.elastic_root_index}-chunk" es.indices.create(index=es_index_name, ignore=[400]) - storage_handler = get_elasticsearch_storage_handler(es) chunk_ingest_chain = ingest_from_loader( document_loader_type=UnstructuredTitleLoader, @@ -66,14 +58,9 @@ def ingest_file(core_file: File) -> str | None: try: new_ids = RunnableParallel({"normal": chunk_ingest_chain, "largest": large_chunk_ingest_chain}).invoke( - core_file + file_name ) - core_file.ingest_status = ProcessingStatusEnum.complete - logging.info("File: %s %s chunks ingested", core_file, {k: len(v) for k, v in new_ids.items()}) + logging.info("File: %s %s chunks ingested", file_name, {k: len(v) for k, v in new_ids.items()}) except Exception as e: - logging.exception("Error while processing file [%s]", core_file) - core_file.ingest_status = ProcessingStatusEnum.failed + logging.exception("Error while processing file [%s]", file_name) return f"{type(e)}: {e.args[0]}" - - finally: - storage_handler.update_item(core_file) diff --git a/redbox-core/redbox/loader/loaders.py b/redbox-core/redbox/loader/loaders.py index 32ce9fcac..27d4fb0a8 100644 --- a/redbox-core/redbox/loader/loaders.py +++ b/redbox-core/redbox/loader/loaders.py @@ -29,7 +29,7 @@ def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments url = f"http://{self.env.unstructured_host}:8000/general/v0/general" files = { - "files": (self.file.key, self.file_bytes), + "files": (self.file_name, self.file_bytes), } response = requests.post( url, @@ -54,7 +54,6 @@ def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments yield Document( page_content=raw_chunk["text"], metadata=ChunkMetadata( - creator_user_uuid=self.file.creator_user_uuid, index=i, file_name=raw_chunk["metadata"].get("filename"), page_number=raw_chunk["metadata"].get("page_number"), @@ -78,7 +77,7 @@ def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments url = f"http://{self.env.unstructured_host}:8000/general/v0/general" files = { - "files": (self.file.key, self.file_bytes), + "files": (self.file_name, self.file_bytes), } response = requests.post( url, @@ -103,7 +102,6 @@ def lazy_load(self) -> Iterator[Document]: # <-- Does not take any arguments yield Document( page_content=raw_chunk["text"], metadata=ChunkMetadata( - creator_user_uuid=self.file.creator_user_uuid, index=i, file_name=raw_chunk["metadata"].get("filename"), page_number=raw_chunk["metadata"].get("page_number"), diff --git a/redbox-core/redbox/models/__init__.py b/redbox-core/redbox/models/__init__.py index 8d1734de0..f8fd04cba 100644 --- a/redbox-core/redbox/models/__init__.py +++ b/redbox-core/redbox/models/__init__.py @@ -1,30 +1,10 @@ from redbox.models.chat import ChatMessage, ChatRequest, ChatResponse, ChatRoute -from redbox.models.errors import ( - APIError404, - APIErrorDetail, - APIErrorResponse, -) -from redbox.models.file import ( - File, - FileStatus, - ProcessingStatusEnum, -) -from redbox.models.persona import ChatPersona from redbox.models.settings import Settings -from redbox.models.status import StatusResponse __all__ = [ - "APIError404", - "APIErrorDetail", - "APIErrorResponse", "ChatMessage", - "ChatPersona", "ChatRequest", "ChatResponse", "ChatRoute", - "File", - "FileStatus", - "ProcessingStatusEnum", "Settings", - "StatusResponse", ] diff --git a/redbox-core/redbox/models/base.py b/redbox-core/redbox/models/base.py deleted file mode 100644 index d7ded6ac0..000000000 --- a/redbox-core/redbox/models/base.py +++ /dev/null @@ -1,22 +0,0 @@ -from datetime import datetime -from uuid import UUID, uuid4 - -from pydantic import BaseModel, Field, computed_field - - -class PersistableModel(BaseModel): - """Base class for all models that can be persisted to the database.""" - - uuid: UUID = Field(default_factory=uuid4) - created_datetime: datetime = Field(default_factory=datetime.utcnow) - creator_user_uuid: UUID - - @computed_field # type: ignore[misc] # Remove if https://github.com/python/mypy/issues/1362 is fixed. - @property # Needed for type checking - see https://docs.pydantic.dev/2.0/usage/computed_fields/ - def model_type(self) -> str: - """Return the name of the model class. - - Returns: - str: The name of the model class. - """ - return self.__class__.__name__ diff --git a/redbox-core/redbox/models/errors.py b/redbox-core/redbox/models/errors.py index 3ea1df21f..976eec774 100644 --- a/redbox-core/redbox/models/errors.py +++ b/redbox-core/redbox/models/errors.py @@ -1,38 +1,3 @@ -from pydantic import AnyUrl, BaseModel, Field - - -class APIErrorDetail(BaseModel): - """A single API error loosely defined to the RFC 9457 format.""" - - parameter: str = Field(description="The name of the query or path parameter that is the source of error.") - detail: str = Field( - description=( - "A granular description on the specific error related to a body property, " - "query parameter, path parameters, and/or header." - ) - ) - - -class APIErrorResponse(BaseModel): - """An API error object loosely defined to the RFC 9457 format.""" - - type: AnyUrl = Field(description="A URI reference that identifies the problem type.") - status: int = Field( - description="The HTTP status code generated by the origin server for this occurrence of the problem." - ) - title: str = Field(description="A short, human-readable summary of the problem type.") - detail: str = Field(description="A human-readable explanation specific to this occurrence of the problem.") - errors: list[APIErrorDetail] = Field( - description="An array of error details to accompany a problem details response." - ) - - -class APIError404(APIErrorResponse): - type: AnyUrl = AnyUrl("http://example.com/error/not-found") - status: int = 404 - title: str = "File not found" - - class AIError(Exception): """Basic error class for problems with the AI components.""" diff --git a/redbox-core/redbox/models/file.py b/redbox-core/redbox/models/file.py index a4773a705..1bc80f7cf 100644 --- a/redbox-core/redbox/models/file.py +++ b/redbox-core/redbox/models/file.py @@ -1,66 +1,11 @@ from __future__ import annotations -from enum import Enum, StrEnum +from enum import StrEnum from uuid import UUID, uuid4 import datetime -import tiktoken from pydantic import BaseModel, Field -from redbox.models.base import PersistableModel - -encoding = tiktoken.get_encoding("cl100k_base") - - -class ProcessingStatusEnum(str, Enum): - """Current status of the file processing. - - Note: The Django app interprets these as: - "processing" -> "processing" - "complete" -> "complete" - "failed" -> "errored" - anything else -> "processing" - - If you add any other fail state options, the Django app will need to be updated. - django_app/redbox_app/redbox_core/models.py File.update_status_from_core - """ - - processing = "processing" - embedding = "embedding" # Used for processing while we transition to new statuses - failed = "failed" - complete = "complete" - - -class File(PersistableModel): - """Reference to file stored on s3""" - - key: str = Field(description="file key") - bucket: str = Field(description="s3 bucket") - ingest_status: ProcessingStatusEnum | None = Field( - description="Status of file ingest for files loaded by new worker", default=None - ) - - -class Link(BaseModel): - text: str | None - url: str - start_index: int - - def __le__(self, other: Link): - """required for sorted""" - return self.start_index <= other.start_index - - def __hash__(self): - return hash(self.text) ^ hash(self.url) ^ hash(self.start_index) - - -class FileStatus(BaseModel): - """Status of a file.""" - - file_uuid: UUID - processing_status: ProcessingStatusEnum | None - chunk_statuses: None = Field(default=None, description="deprecated, see processing_status") - class ChunkResolution(StrEnum): smallest = "smallest" @@ -77,7 +22,6 @@ class ChunkMetadata(BaseModel): """ uuid: UUID = Field(default_factory=uuid4) - creator_user_uuid: UUID index: int file_name: str page_number: int | None = None diff --git a/redbox-core/redbox/models/persona.py b/redbox-core/redbox/models/persona.py deleted file mode 100644 index c118db539..000000000 --- a/redbox-core/redbox/models/persona.py +++ /dev/null @@ -1,7 +0,0 @@ -from redbox.models.base import PersistableModel - - -class ChatPersona(PersistableModel): - name: str | None - description: str | None - prompt: str | None diff --git a/redbox-core/redbox/models/status.py b/redbox-core/redbox/models/status.py deleted file mode 100644 index bac214a25..000000000 --- a/redbox-core/redbox/models/status.py +++ /dev/null @@ -1,7 +0,0 @@ -from pydantic import BaseModel - - -class StatusResponse(BaseModel): - status: str - uptime_seconds: float - version: str diff --git a/redbox-core/redbox/retriever/queries.py b/redbox-core/redbox/retriever/queries.py index 6c59e8d21..6b6e2fac2 100644 --- a/redbox-core/redbox/retriever/queries.py +++ b/redbox-core/redbox/retriever/queries.py @@ -1,5 +1,4 @@ from typing import Any -from uuid import UUID from langchain_core.embeddings.embeddings import Embeddings @@ -7,30 +6,21 @@ from redbox.models.file import ChunkResolution -def make_query_filter(user_uuid: UUID, file_names: list[str], chunk_resolution: ChunkResolution | None) -> list[dict]: - query_filter: list[dict] = [ +def make_query_filter(file_names: list[str], chunk_resolution: ChunkResolution | None) -> list[dict]: + if not file_names: + return [] + + query_filter = [ { "bool": { "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, + {"terms": {"file_name.keyword": file_names}}, + {"terms": {"metadata.file_name.keyword": file_names}}, ] } } ] - if len(file_names) != 0: - query_filter.append( - { - "bool": { - "should": [ - {"terms": {"file_name.keyword": file_names}}, - {"terms": {"metadata.file_name.keyword": file_names}}, - ] - } - } - ) - if chunk_resolution: query_filter.append( { @@ -54,7 +44,7 @@ def get_all( As it's used in summarisation, it excludes embeddings. """ - query_filter = make_query_filter(state["request"].user_uuid, state["request"].s3_keys, chunk_resolution) + query_filter = make_query_filter(state["request"].s3_keys, chunk_resolution) return { "_source": {"excludes": ["*embedding"]}, "query": {"bool": {"must": {"match_all": {}}, "filter": query_filter}}, @@ -69,7 +59,7 @@ def get_some( ) -> dict[str, Any]: vector = embedding_model.embed_query(state["request"].question) - query_filter = make_query_filter(state["request"].user_uuid, state["request"].s3_keys, chunk_resolution) + query_filter = make_query_filter(state["request"].s3_keys, chunk_resolution) return { "size": state["request"].ai_settings.rag_k, diff --git a/redbox-core/redbox/storage/__init__.py b/redbox-core/redbox/storage/__init__.py deleted file mode 100644 index 24a2f77aa..000000000 --- a/redbox-core/redbox/storage/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from redbox.storage.elasticsearch import ElasticsearchStorageHandler -from redbox.storage.storage_handler import BaseStorageHandler - -__all__ = [ - "BaseStorageHandler", - "ElasticsearchStorageHandler", -] diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py deleted file mode 100644 index f6e2fb3eb..000000000 --- a/redbox-core/redbox/storage/elasticsearch.py +++ /dev/null @@ -1,167 +0,0 @@ -import logging -import os -from collections.abc import Sequence -from uuid import UUID - -from elastic_transport import ObjectApiResponse -from elasticsearch import Elasticsearch, NotFoundError -from elasticsearch.helpers import scan -from pydantic import ValidationError - -from redbox.models.base import PersistableModel -from redbox.storage.storage_handler import BaseStorageHandler - -logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO")) -log = logging.getLogger() - - -class ElasticsearchStorageHandler(BaseStorageHandler): - """Storage Handler for Elasticsearch""" - - def __init__( - self, - es_client: Elasticsearch, - root_index: str, - ): - """Initialise the storage handler - - Args: - es_client (Elasticsearch): Elasticsearch client - root_index (str, optional): Root index to use. Defaults to "redbox". - """ - self.es_client = es_client - self.root_index = root_index - - def refresh(self, index: str = "*") -> ObjectApiResponse: - return self.es_client.indices.refresh(index=f"{self.root_index}-{index}") - - def write_item(self, item: PersistableModel) -> ObjectApiResponse: - target_index = f"{self.root_index}-{item.model_type.lower()}" - - return self.es_client.index( - index=target_index, - id=str(item.uuid), - body=item.model_dump(mode="json"), - ) - - def write_items(self, items: Sequence[PersistableModel]) -> Sequence[ObjectApiResponse]: - return list(map(self.write_item, items)) - - def read_item(self, item_uuid: UUID, model_type: str): - target_index = f"{self.root_index}-{model_type.lower()}" - result = self.es_client.get(index=target_index, id=str(item_uuid)) - model = self.get_model_by_model_type(model_type) - return model(**result.body["_source"]) - - def read_items(self, item_uuids: list[UUID], model_type: str): - target_index = f"{self.root_index}-{model_type.lower()}" - result = self.es_client.mget(index=target_index, body={"ids": list(map(str, item_uuids))}) - - model = self.get_model_by_model_type(model_type) - return [model(**item["_source"]) for item in result.body["docs"]] - - def update_item(self, item: PersistableModel) -> ObjectApiResponse: - target_index = f"{self.root_index}-{item.model_type.lower()}" - - return self.es_client.index( - index=target_index, - id=str(item.uuid), - body=item.model_dump(mode="json"), - ) - - def update_items(self, items: list[PersistableModel]) -> list[ObjectApiResponse]: - return list(map(self.update_item, items)) - - def delete_item(self, item: PersistableModel) -> ObjectApiResponse: - target_index = f"{self.root_index}-{item.model_type.lower()}" - return self.es_client.delete(index=target_index, id=str(item.uuid)) - - def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | None: - if not items: - return None - - if len({item.model_type for item in items}) > 1: - message = "Items with differing model types: {item.model_type for item in items}" - raise ValueError(message) - model_type = items[0].model_type - target_index = f"{self.root_index}-{model_type.lower()}" - return self.es_client.delete_by_query( - index=target_index, - body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, - ) - - def delete_user_items( - self, model_type: str, user_uuid: UUID, filters: list[dict] = None - ) -> ObjectApiResponse | None: - target_index = f"{self.root_index}-{model_type.lower()}" - return self.es_client.delete_by_query( - index=target_index, - body=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid, filters), - ) - - def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableModel]: - target_index = f"{self.root_index}-{model_type.lower()}" - try: - result = scan( - client=self.es_client, - index=target_index, - query=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid), - _source=True, - ) - - except NotFoundError: - log.info("Index %s not found. Returning empty list.", target_index) - return [] - - # Grab the model we'll use to deserialize the items - model = self.get_model_by_model_type(model_type) - try: - results = list(result) - except NotFoundError: - return [] - - items = [] - for item in results: - try: - items.append(model(**item["_source"])) - except ValidationError as e: - log.exception("Validation exception for %s", item, exc_info=e) - return items - - def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> list[UUID]: - target_index = f"{self.root_index}-{model_type.lower()}" - try: - # Only return _id - results = scan( - client=self.es_client, - index=target_index, - query=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid, filters), - _source=False, - ) - - except NotFoundError: - log.info("Index %s not found. Returning empty list.", target_index) - return [] - return [UUID(item["_id"]) for item in results] - - @classmethod - def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = None): - query = { - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - }, - } - } - if filters: - query["query"]["bool"]["filter"] = filters - return query - - @classmethod - def get_with_parent_file_filter(cls, file_name: str): - return { - "term": {"metadata.file_name.keyword": file_name}, - } diff --git a/redbox-core/redbox/storage/storage_handler.py b/redbox-core/redbox/storage/storage_handler.py deleted file mode 100644 index d14e3bafe..000000000 --- a/redbox-core/redbox/storage/storage_handler.py +++ /dev/null @@ -1,62 +0,0 @@ -from abc import ABC, abstractmethod -from typing import ClassVar -from uuid import UUID - -from redbox.models import File -from redbox.models.base import PersistableModel - - -class BaseStorageHandler(ABC): - """Abstract Class for Storage Handler which manages all file and object IO - the Redbox backend. - """ - - # dict comprehension for lowercase class name to class - model_type_map: ClassVar = {v.__name__.lower(): v for v in [File]} - - def get_model_by_model_type(self, model_type): - return self.model_type_map[model_type.lower()] - - @abstractmethod - def __init__(self): - """Initialise the storage handler""" - - @abstractmethod - def write_item(self, item: PersistableModel): - """Write an object to a data store""" - - @abstractmethod - def write_items(self, items: list[PersistableModel]): - """Write a list of objects to a data store""" - - @abstractmethod - def read_item(self, item_uuid: UUID, model_type: str): - """Read an object from a data store""" - - @abstractmethod - def read_items(self, item_uuids: list[UUID], model_type: str): - """Read a list of objects from a data store""" - - @abstractmethod - def update_item(self, item: PersistableModel): - """Update an object in a data store""" - - @abstractmethod - def update_items(self, items: list[PersistableModel]): - """Update a list of objects in a data store""" - - @abstractmethod - def delete_item(self, item: PersistableModel): - """Delete an object from a data store""" - - @abstractmethod - def delete_items(self, items: list[PersistableModel]): - """Delete a list of objects from a data store""" - - @abstractmethod - def list_all_items(self, model_type: str, user_uuid: UUID): - """List all objects of a given type from a data store""" - - @abstractmethod - def read_all_items(self, model_type: str, user_uuid: UUID): - """Read all objects of a given type from a data store""" diff --git a/redbox-core/redbox/test/data.py b/redbox-core/redbox/test/data.py index 780d11b9d..88a73d293 100644 --- a/redbox-core/redbox/test/data.py +++ b/redbox-core/redbox/test/data.py @@ -1,7 +1,6 @@ from dataclasses import dataclass, field import datetime import logging -from uuid import UUID from typing import Generator from langchain_core.documents import Document @@ -15,7 +14,6 @@ def generate_docs( - creator_user_uuid: UUID, s3_key: str = "test_data.pdf", page_numbers: list[int] = [1, 2, 3, 4], total_tokens=6000, @@ -26,7 +24,6 @@ def generate_docs( yield Document( page_content=f"Document {i} text", metadata=ChunkMetadata( - creator_user_uuid=creator_user_uuid, index=i, file_name=s3_key, page_number=page_numbers[int(i / number_of_docs) * len(page_numbers)], @@ -52,13 +49,10 @@ def __init__( test_id: str, query: RedboxQuery, test_data: RedboxTestData, - docs_user_uuid_override: UUID | None = None, s3_keys_override: list[str] | None = None, ): # Use separate file_uuids if specified else match the query all_s3_keys = s3_keys_override if s3_keys_override else query.s3_keys - # Use separate user uuid if specific else match the query - docs_user_uuid = docs_user_uuid_override if docs_user_uuid_override else query.user_uuid if ( test_data.expected_llm_response is not None @@ -71,7 +65,6 @@ def __init__( file_generators = [ generate_docs( s3_key=s3_key, - creator_user_uuid=docs_user_uuid, total_tokens=int(test_data.tokens_in_all_docs / len(all_s3_keys)), number_of_docs=int(test_data.number_of_docs / len(all_s3_keys)), chunk_resolution=test_data.chunk_resolution, @@ -84,12 +77,7 @@ def __init__( self.test_id = test_id def get_docs_matching_query(self): - return [ - doc - for doc in self.docs - if doc.metadata["file_name"] in set(self.query.s3_keys) - and doc.metadata["creator_user_uuid"] == self.query.user_uuid - ] + return [doc for doc in self.docs if doc.metadata["file_name"] in set(self.query.s3_keys)] def generate_test_cases(query: RedboxQuery, test_data: list[RedboxTestData], test_id: str) -> list[RedboxChatTestCase]: diff --git a/redbox-core/tests/conftest.py b/redbox-core/tests/conftest.py index 625e96601..8e6cee06f 100644 --- a/redbox-core/tests/conftest.py +++ b/redbox-core/tests/conftest.py @@ -9,8 +9,7 @@ import tiktoken from tiktoken.core import Encoding -from redbox.models import File, Settings -from redbox.storage.elasticsearch import ElasticsearchStorageHandler +from redbox.models import Settings from collections.abc import Generator @@ -82,11 +81,6 @@ def es_client(env: Settings, es_index: str, es_index_file: str) -> Elasticsearch return env.elasticsearch_client() -@pytest.fixture(scope="session") -def es_storage_handler(es_client: Elasticsearch, env: Settings) -> ElasticsearchStorageHandler: - return ElasticsearchStorageHandler(es_client=es_client, root_index=env.elastic_root_index) - - @pytest.fixture(scope="session") def es_vector_store( es_client: Elasticsearch, es_index: str, embedding_model: FakeEmbeddings, env: Settings @@ -158,49 +152,7 @@ def file_pdf_path() -> Path: @pytest.fixture() -def file_belonging_to_alice( - file_pdf_path: Path, alice: UUID, env: Settings, es_storage_handler: ElasticsearchStorageHandler -) -> File: - f = File( - key=file_pdf_path.name, - bucket=env.bucket_name, - creator_user_uuid=alice, - ) - es_storage_handler.write_item(f) - es_storage_handler.refresh() - return f - - -@pytest.fixture() -def file_belonging_to_bob( - file_pdf_path: Path, bob: UUID, env: Settings, es_storage_handler: ElasticsearchStorageHandler -) -> File: - f = File( - key=file_pdf_path.name, - bucket=env.bucket_name, - creator_user_uuid=bob, - ) - es_storage_handler.write_item(f) - es_storage_handler.refresh() - return f - - -@pytest.fixture() -def file_belonging_to_claire( - file_pdf_path: Path, claire: UUID, env: Settings, es_storage_handler: ElasticsearchStorageHandler -) -> File: - f = File( - key=file_pdf_path.name, - bucket=env.bucket_name, - creator_user_uuid=claire, - ) - es_storage_handler.write_item(f) - es_storage_handler.refresh() - return f - - -@pytest.fixture() -def file(s3_client: S3Client, file_pdf_path: Path, alice: UUID, env: Settings) -> File: +def file(s3_client: S3Client, file_pdf_path: Path, alice: UUID, env: Settings) -> str: file_name = file_pdf_path.name file_type = file_pdf_path.suffix @@ -212,7 +164,7 @@ def file(s3_client: S3Client, file_pdf_path: Path, alice: UUID, env: Settings) - Tagging=f"file_type={file_type}", ) - return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice) + return file_name @pytest.fixture(params=ALL_CHUNKS_RETRIEVER_CASES) diff --git a/redbox-core/tests/graph/test_patterns.py b/redbox-core/tests/graph/test_patterns.py index 07a8282e1..86ffcf40c 100644 --- a/redbox-core/tests/graph/test_patterns.py +++ b/redbox-core/tests/graph/test_patterns.py @@ -279,7 +279,7 @@ def test_empty_process(): """Tests the empty process doesn't touch the state whatsoever.""" state = RedboxState( request=RedboxQuery(question="What is AI?", s3_keys=[], user_uuid=uuid4(), chat_history=[]), - documents=structure_documents([doc for doc in generate_docs(s3_key="s3_key", creator_user_uuid=uuid4())]), + documents=structure_documents([doc for doc in generate_docs(s3_key="s3_key")]), text="Foo", route_name=ChatRoute.chat_with_docs_map_reduce, ) @@ -299,7 +299,7 @@ def test_empty_process(): CLEAR_DOC_TEST_CASES = [ RedboxState( request=RedboxQuery(question="What is AI?", file_uuids=[], user_uuid=uuid4(), chat_history=[]), - documents=structure_documents([doc for doc in generate_docs(s3_key="s3_key", creator_user_uuid=uuid4())]), + documents=structure_documents([doc for doc in generate_docs(s3_key="s3_key")]), text="Foo", route_name=ChatRoute.chat_with_docs_map_reduce, ), diff --git a/redbox-core/tests/storage/__init__.py b/redbox-core/tests/storage/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/redbox-core/tests/storage/test_elasticsearch.py b/redbox-core/tests/storage/test_elasticsearch.py deleted file mode 100644 index 9058db583..000000000 --- a/redbox-core/tests/storage/test_elasticsearch.py +++ /dev/null @@ -1,149 +0,0 @@ -from uuid import UUID, uuid4 - -import pytest -from elasticsearch import NotFoundError, Elasticsearch - -from redbox.models import File -from redbox.storage.elasticsearch import ElasticsearchStorageHandler - - -def test_elasticsearch_client_connection(es_client: Elasticsearch, es_storage_handler: ElasticsearchStorageHandler): - """ - Given that I have a valid Elasticsearch client - When I call the info method - Then I expect to see a valid response - - This test is to check all our following elasticsearch tests can proceed. - """ - conn_test_resp = es_client.info() - assert conn_test_resp["tagline"] == "You Know, for Search" - - assert isinstance(es_storage_handler.model_type_map, dict) - - -def test_elasticsearch_write_read_item(es_storage_handler: ElasticsearchStorageHandler, file_belonging_to_alice: File): - """ - Given that `File` is a valid model - When I - Then I expect a valid File to be returned on read" - """ - # Write the file - es_storage_handler.write_item(item=file_belonging_to_alice) - - # Read the File - item_read = es_storage_handler.read_item(file_belonging_to_alice.uuid, "File") - - assert item_read.uuid == file_belonging_to_alice.uuid - - -def test_elastic_delete_item_fail( - es_storage_handler: ElasticsearchStorageHandler, -): - """ - Given that I have an non-existent item uuid - When I call delete_item on it - Then I expect to see a NotFoundError error raised - """ - with pytest.raises(NotFoundError): - es_storage_handler.delete_item(File(uuid=uuid4(), creator_user_uuid=uuid4(), key="", bucket="")) - - -def test_elastic_read_item_fail( - es_storage_handler: ElasticsearchStorageHandler, -): - """ - Given that I have an non-existent item uuid - When I call read_item on its uuid - Then I expect to see a NotFoundError error raised - """ - with pytest.raises(NotFoundError): - es_storage_handler.read_item(uuid4(), "File") - - -def test_elastic_write_read_delete_items(es_storage_handler: ElasticsearchStorageHandler): - """ - Given that I have a list of items - When I call write_items on them - Then I expect to see them written to the database - """ - creator_user_uuid = uuid4() - files = [File(creator_user_uuid=creator_user_uuid, key=f"somefile-{i}.txt", bucket="a-bucket") for i in range(10)] - - es_storage_handler.write_items(files) - - read_files = es_storage_handler.read_items([file.uuid for file in files], "File") - - assert read_files == files - - # Delete the files - es_storage_handler.delete_items(files) - - # Check that the files are deleted - items_left = es_storage_handler.list_all_items("File", creator_user_uuid) - - assert all(file.uuid not in items_left for file in files) - - -def test_list_all_items( - es_storage_handler: ElasticsearchStorageHandler, - file_belonging_to_alice: File, - file_belonging_to_bob: File, - alice: UUID, -): - """ - Given that I have - * a saved file belonging to Alice - * a saved file belonging to Bob - When I call list_all_items as alice - Then I expect to see the uuids of the saved objects that belong to alice returned - """ - uuids = es_storage_handler.list_all_items("File", alice) - assert len(uuids) == 1, f"Unexpected number of files {len(uuids)}" - - -def test_read_all_items( - es_storage_handler: ElasticsearchStorageHandler, - file_belonging_to_alice: File, - file_belonging_to_bob: File, - alice: UUID, -): - """ - Given that I have - * a saved file belonging to Alice - * a saved file belonging to Bob - When I call read_all_items as alice - Then I expect to see the one File belonging to alice - """ - files = es_storage_handler.read_all_items("File", alice) - assert len(files) == 1 - assert files[0].creator_user_uuid == alice - - -def test_elastic_delete_item(es_storage_handler: ElasticsearchStorageHandler, file_belonging_to_alice: File): - """ - Given that I have a saved object - When I call delete_item on it - Then I expect to not be able to read the item - """ - es_storage_handler.delete_item(file_belonging_to_alice) - - with pytest.raises(NotFoundError): - es_storage_handler.read_item(file_belonging_to_alice.uuid, "File") - - -def test_elastic_delete_user_item( - es_storage_handler: ElasticsearchStorageHandler, file_belonging_to_alice: File, alice: UUID -): - """ - Given that I have a saved object - When I call delete_item on it - Then I expect to not be able to read the item - """ - files = es_storage_handler.read_all_items("File", alice) - assert len(files) == 1 - assert files[0].creator_user_uuid == alice - - es_storage_handler.delete_user_items("file", alice) - es_storage_handler.refresh() - files = es_storage_handler.read_all_items("File", alice) - assert len(files) == 0 diff --git a/redbox-core/tests/test_ingest.py b/redbox-core/tests/test_ingest.py index 4d434b049..1a8259509 100644 --- a/redbox-core/tests/test_ingest.py +++ b/redbox-core/tests/test_ingest.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING from pathlib import Path -from uuid import uuid4, UUID from typing import Any import pytest @@ -11,7 +10,6 @@ from elasticsearch import Elasticsearch from unittest.mock import MagicMock, patch -from redbox.models.file import File, ProcessingStatusEnum from redbox.loader import ingester from redbox.loader.ingester import ingest_file @@ -29,7 +27,7 @@ S3Client = object -def file_to_s3(filename: str, s3_client: S3Client, env: Settings) -> File: +def file_to_s3(filename: str, s3_client: S3Client, env: Settings) -> str: file_path = Path(__file__).parents[2] / "tests" / "data" / filename file_name = file_path.name file_type = file_path.suffix @@ -42,11 +40,11 @@ def file_to_s3(filename: str, s3_client: S3Client, env: Settings) -> File: Tagging=f"file_type={file_type}", ) - return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=uuid4()) + return file_name -def make_file_query(user_uuid: UUID, file_name: str, resolution: ChunkResolution | None = None) -> dict[str, Any]: - query_filter = make_query_filter(user_uuid, [file_name], resolution) +def make_file_query(file_name: str, resolution: ChunkResolution | None = None) -> dict[str, Any]: + query_filter = make_query_filter([file_name], resolution) return {"query": {"bool": {"must": [{"match_all": {}}], "filter": query_filter}}} @@ -133,15 +131,15 @@ def test_ingest_from_loader( monkeypatch.setattr(ingester, "get_embeddings", lambda _: FakeEmbeddings(size=3072)) # Upload file and call - file = file_to_s3(filename="html/example.html", s3_client=s3_client, env=env) + file_name = file_to_s3(filename="html/example.html", s3_client=s3_client, env=env) ingest_chain = ingest_from_loader( document_loader_type=document_loader_type, s3_client=s3_client, vectorstore=es_vector_store, env=env ) - _ = ingest_chain.invoke(file) + _ = ingest_chain.invoke(file_name) # Test it's written to Elastic - file_query = make_file_query(user_uuid=file.creator_user_uuid, file_name=file.key, resolution=resolution) + file_query = make_file_query(file_name=file_name, resolution=resolution) chunks = list(scan(client=es_client, index=f"{env.elastic_root_index}-chunk", query=file_query)) assert len(chunks) > 0 @@ -157,11 +155,11 @@ def test_ingest_from_loader( @patch("redbox.loader.loaders.requests.post") @pytest.mark.parametrize( - ("filename", "status", "mock_json"), + ("filename", "is_complete", "mock_json"), [ ( "html/example.html", - ProcessingStatusEnum.complete, + True, [ { "type": "CompositeElement", @@ -176,7 +174,7 @@ def test_ingest_from_loader( } ], ), - ("html/corrupt.html", ProcessingStatusEnum.failed, None), + ("html/corrupt.html", False, None), ], ) def test_ingest_file( @@ -186,7 +184,7 @@ def test_ingest_file( monkeypatch: MonkeyPatch, env: Settings, filename: str, - status: ProcessingStatusEnum, + is_complete: bool, mock_json: list | None, ): """ @@ -205,20 +203,17 @@ def test_ingest_file( monkeypatch.setattr(ingester, "get_embeddings", lambda _: FakeEmbeddings(size=3072)) # Upload file and call - file = file_to_s3(filename=filename, s3_client=s3_client, env=env) + filename = file_to_s3(filename=filename, s3_client=s3_client, env=env) - res = ingest_file(file) + res = ingest_file(filename) - if status == ProcessingStatusEnum.failed: + if not is_complete: assert isinstance(res, str) - elif status == ProcessingStatusEnum.complete: + else: assert res is None # Test it's written to Elastic - file_query = make_file_query( - user_uuid=file.creator_user_uuid, - file_name=file.key, - ) + file_query = make_file_query(file_name=filename) chunks = list(scan(client=es_client, index=f"{env.elastic_root_index}-chunk", query=file_query)) assert len(chunks) > 0