Skip to content

Commit

Permalink
Add application files
Browse files Browse the repository at this point in the history
  • Loading branch information
n-y-kim committed Mar 10, 2024
1 parent 1f421ff commit 2f933a0
Show file tree
Hide file tree
Showing 27,993 changed files with 3,031,800 additions and 57 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
16 changes: 16 additions & 0 deletions app/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
FROM node:14

# Install Python 3.10
# NOTE(review): node:14 is Debian-based (buster/bullseye); confirm that
# python3.10 is actually available from its apt repositories — those releases
# ship older Python versions by default.
# Install and clean the apt lists in the same layer so the cache files do not
# bloat the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends python3.10 \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy only the dependency manifests first so the "npm install" layer stays
# cached until package.json / package-lock.json actually change.
COPY package*.json ./

# Install dependencies
RUN npm install

# Copy the rest of the application code to the container
COPY . .

# Start your application
CMD [ "npm", "start" ]
435 changes: 435 additions & 0 deletions app/backend/app.py

Large diffs are not rendered by default.

File renamed without changes.
216 changes: 216 additions & 0 deletions app/backend/approaches/approach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
import os
from abc import ABC
from dataclasses import dataclass
from typing import Any, AsyncGenerator, Awaitable, Callable, List, Optional, Union, cast
from urllib.parse import urljoin

import aiohttp
from azure.search.documents.aio import SearchClient
from azure.search.documents.models import (
QueryCaptionResult,
QueryType,
VectorizedQuery,
VectorQuery,
)
from openai import AsyncOpenAI

from core.authentication import AuthenticationHelper
from text import nonewlines


@dataclass
class Document:
    """A single search result from Azure AI Search, normalized for this app.

    Fields mirror the search index schema; any of them may be missing on a
    given result, hence the Optional types. ``captions`` holds the semantic
    captions returned under ``@search.captions`` when semantic captions are
    requested.
    """

    id: Optional[str]
    content: Optional[str]
    embedding: Optional[List[float]]
    image_embedding: Optional[List[float]]
    category: Optional[str]
    sourcepage: Optional[str]
    sourcefile: Optional[str]
    oids: Optional[List[str]]
    groups: Optional[List[str]]
    captions: List[QueryCaptionResult]

    def serialize_for_results(self) -> dict[str, Any]:
        """Return a JSON-serializable dict of this document for API responses.

        Embeddings are replaced by a short preview string (see
        ``trim_embedding``) and captions are flattened to plain dicts.
        """
        return {
            "id": self.id,
            "content": self.content,
            "embedding": Document.trim_embedding(self.embedding),
            "imageEmbedding": Document.trim_embedding(self.image_embedding),
            "category": self.category,
            "sourcepage": self.sourcepage,
            "sourcefile": self.sourcefile,
            "oids": self.oids,
            "groups": self.groups,
            "captions": (
                [
                    {
                        "additional_properties": caption.additional_properties,
                        "text": caption.text,
                        "highlights": caption.highlights,
                    }
                    for caption in self.captions
                ]
                if self.captions
                else []
            ),
        }

    @classmethod
    def trim_embedding(cls, embedding: Optional[List[float]]) -> Optional[str]:
        """Returns a trimmed preview string of the vector embedding.

        Returns None when the embedding is absent or empty.
        """
        if embedding:
            if len(embedding) > 2:
                # Show the first 2 items followed by the count of the remaining items.
                return f"[{embedding[0]}, {embedding[1]} ...+{len(embedding) - 2} more]"
            else:
                return str(embedding)

        return None


@dataclass
class ThoughtStep:
    """One step of an approach's reasoning trace, surfaced for debugging.

    Attributes:
        title: Short human-readable label for the step.
        description: Free-form payload describing what the step did (may be None).
        props: Optional extra key/value details for display.
    """

    title: str
    description: Optional[Any]
    props: Optional[dict[str, Any]] = None


class Approach(ABC):
    """Base class for chat/ask approaches.

    Provides shared helpers for querying Azure AI Search (with optional
    semantic ranking and vector queries), building security/category filters,
    formatting source citations, and computing text/image embeddings.
    Subclasses implement ``run``.
    """

    def __init__(
        self,
        search_client: SearchClient,
        openai_client: AsyncOpenAI,
        auth_helper: AuthenticationHelper,
        query_language: Optional[str],
        query_speller: Optional[str],
        embedding_deployment: Optional[str],  # Not needed for non-Azure OpenAI or for retrieval_mode="text"
        embedding_model: str,
        openai_host: str,
        vision_endpoint: str,
        vision_token_provider: Callable[[], Awaitable[str]],
    ):
        self.search_client = search_client
        self.openai_client = openai_client
        self.auth_helper = auth_helper
        self.query_language = query_language
        self.query_speller = query_speller
        self.embedding_deployment = embedding_deployment
        self.embedding_model = embedding_model
        self.openai_host = openai_host
        self.vision_endpoint = vision_endpoint
        self.vision_token_provider = vision_token_provider

    def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]:
        """Build an OData filter string from the category exclusion override
        and the caller's security filter, or None when neither applies."""
        exclude_category = overrides.get("exclude_category")
        security_filter = self.auth_helper.build_security_filters(overrides, auth_claims)
        filters = []
        if exclude_category:
            # Escape single quotes per OData string-literal rules.
            filters.append("category ne '{}'".format(exclude_category.replace("'", "''")))
        if security_filter:
            filters.append(security_filter)
        return None if len(filters) == 0 else " and ".join(filters)

    async def search(
        self,
        top: int,
        query_text: Optional[str],
        filter: Optional[str],
        vectors: List[VectorQuery],
        use_semantic_ranker: bool,
        use_semantic_captions: bool,
    ) -> List[Document]:
        """Query the search index and return results as Document objects.

        Uses the semantic ranker only when requested AND there is query text
        (i.e. retrieval mode is text or hybrid); otherwise issues a plain
        (possibly vector-only) search.
        """
        # Use semantic ranker if requested and if retrieval mode is text or hybrid (vectors + text)
        if use_semantic_ranker and query_text:
            results = await self.search_client.search(
                search_text=query_text,
                filter=filter,
                query_type=QueryType.SEMANTIC,
                query_language=self.query_language,
                query_speller=self.query_speller,
                semantic_configuration_name="default",
                top=top,
                query_caption="extractive|highlight-false" if use_semantic_captions else None,
                vector_queries=vectors,
            )
        else:
            results = await self.search_client.search(
                search_text=query_text or "", filter=filter, top=top, vector_queries=vectors
            )

        documents = []
        async for page in results.by_page():
            async for document in page:
                documents.append(
                    Document(
                        id=document.get("id"),
                        content=document.get("content"),
                        embedding=document.get("embedding"),
                        image_embedding=document.get("imageEmbedding"),
                        category=document.get("category"),
                        sourcepage=document.get("sourcepage"),
                        sourcefile=document.get("sourcefile"),
                        oids=document.get("oids"),
                        groups=document.get("groups"),
                        captions=cast(List[QueryCaptionResult], document.get("@search.captions")),
                    )
                )
        return documents

    def get_sources_content(
        self, results: List[Document], use_semantic_captions: bool, use_image_citation: bool
    ) -> list[str]:
        """Format each result as "<citation>: <text>", using semantic captions
        when requested, otherwise the full document content (newlines stripped)."""
        if use_semantic_captions:
            return [
                (self.get_citation((doc.sourcepage or ""), use_image_citation))
                + ": "
                + nonewlines(" . ".join([cast(str, c.text) for c in (doc.captions or [])]))
                for doc in results
            ]
        else:
            return [
                (self.get_citation((doc.sourcepage or ""), use_image_citation)) + ": " + nonewlines(doc.content or "")
                for doc in results
            ]

    def get_citation(self, sourcepage: str, use_image_citation: bool) -> str:
        """Return the citation text for a source page.

        Page images are assumed to be named "<file>-<page>.png"; when not
        citing the image itself, map that back to "<file>.pdf#page=<page>".
        Any other sourcepage is returned unchanged.
        """
        if use_image_citation:
            return sourcepage
        else:
            path, ext = os.path.splitext(sourcepage)
            if ext.lower() == ".png":
                page_idx = path.rfind("-")
                page_suffix = path[page_idx + 1 :]
                # Guard against names that don't match "<file>-<page>.png":
                # previously a missing hyphen or non-numeric suffix raised
                # ValueError from int(); now we fall back to the raw sourcepage.
                if page_idx != -1 and page_suffix.isdigit():
                    return f"{path[:page_idx]}.pdf#page={int(page_suffix)}"

            return sourcepage

    async def compute_text_embedding(self, q: str):
        """Embed the query text with OpenAI and wrap it as a VectorizedQuery
        over the "embedding" field."""
        embedding = await self.openai_client.embeddings.create(
            # Azure Open AI takes the deployment name as the model name
            model=self.embedding_deployment if self.embedding_deployment else self.embedding_model,
            input=q,
        )
        query_vector = embedding.data[0].embedding
        return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields="embedding")

    async def compute_image_embedding(self, q: str):
        """Embed the query text with the Azure Computer Vision vectorizeText
        API and wrap it as a VectorizedQuery over the "imageEmbedding" field."""
        endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
        headers = {"Content-Type": "application/json"}
        params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
        data = {"text": q}

        headers["Authorization"] = "Bearer " + await self.vision_token_provider()

        async with aiohttp.ClientSession() as session:
            async with session.post(
                url=endpoint, params=params, headers=headers, json=data, raise_for_status=True
            ) as response:
                # Renamed from "json" to avoid shadowing the well-known module name.
                response_json = await response.json()
                image_query_vector = response_json["vector"]
        return VectorizedQuery(vector=image_query_vector, k_nearest_neighbors=50, fields="imageEmbedding")

    async def run(
        self, messages: list[dict], stream: bool = False, session_state: Any = None, context: dict[str, Any] = {}
    ) -> Union[dict[str, Any], AsyncGenerator[dict[str, Any], None]]:
        """Execute the approach; implemented by subclasses.

        NOTE: the mutable {} default is harmless here because this base
        implementation never reads or mutates ``context``.
        """
        raise NotImplementedError
Loading

0 comments on commit 2f933a0

Please sign in to comment.