fix: better logging
Signed-off-by: Anupam Kumar <[email protected]>
kyteinsky committed Jan 17, 2025
1 parent a936e85 commit bd9bf8b
Showing 27 changed files with 695 additions and 381 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration-test.yml
@@ -237,5 +237,5 @@ jobs:
           echo '--------------------------------------------------'
           [ -f context_chat_backend/backend_logs ] && cat context_chat_backend/backend_logs || echo "No backend logs"
           echo '--------------------------------------------------'
-          [ -f context_chat_backend/embedding_model.log ] && cat context_chat_backend/embedding_model.log || echo "No embedding backend logs"
+          tail -v -n +1 logs/* || echo "No logs in logs directory"
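The new CI step dumps every file under logs/ with a per-file banner: tail -n +1 prints each file from its first line, and -v always prefixes the '==> file <==' header. For illustration only, a Python equivalent of that behaviour (the helper name is made up):

    from pathlib import Path

    def dump_logs(log_dir: str = 'logs') -> None:
        files = sorted(Path(log_dir).glob('*'))
        if not files:
            print('No logs in logs directory')
            return
        for f in files:
            print(f'==> {f} <==')  # mirrors tail -v's per-file banner
            print(f.read_text(errors='replace'))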
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,4 +5,4 @@ __pycache__/
 .env
 persistent_storage/*
 .vscode/
-embedding_model.log
+logs/
1 change: 1 addition & 0 deletions config.cpu.yaml
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 debug: true
+uvicorn_log_level: info
 disable_aaa: false
 httpx_verify_ssl: true
 use_colors: true
1 change: 1 addition & 0 deletions config.gpu.yaml
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 debug: true
+uvicorn_log_level: info
 disable_aaa: false
 httpx_verify_ssl: true
 use_colors: true
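The new uvicorn_log_level key presumably feeds uvicorn's own logger. A minimal sketch of how such a setting could be wired up, assuming the server is started via uvicorn.run — the function and its wiring are illustrative, not this repository's actual startup code:

    import uvicorn

    def start(config) -> None:
        # 'uvicorn_log_level' comes from the YAML above; uvicorn accepts
        # 'critical' | 'error' | 'warning' | 'info' | 'debug' | 'trace'.
        uvicorn.run(
            'context_chat_backend:app',
            log_level=config.uvicorn_log_level,
            use_colors=config.use_colors,
        )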
5 changes: 5 additions & 0 deletions context_chat_backend/chain/context.py
@@ -2,11 +2,14 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
 
 from langchain.schema import Document
 
 from ..vectordb.base import BaseVectorDB
 from .types import ContextException, ScopeType
 
+logger = logging.getLogger('ccb.chain')
 
 def get_context_docs(
     user_id: str,
@@ -18,11 +21,13 @@ def get_context_docs(
 ) -> list[Document]:
     # unscoped search
     if not scope_type:
+        logger.debug('Searching for context docs without scope')
         return vectordb.doc_search(user_id, query, ctx_limit)
 
     if not scope_list:
         raise ContextException('Error: scope list must be provided and not empty if scope type is provided')
 
+    logger.debug('Searching for context docs with scope')
     return vectordb.doc_search(user_id, query, ctx_limit, scope_type, scope_list)
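Module loggers such as 'ccb.chain' here (and 'ccb.doc_loader', 'ccb.injest' below) all live under a common 'ccb' parent, so a single handler configured on that parent covers every module. A hypothetical one-time setup — the handler and format choices are assumptions, not taken from this commit:

    import logging

    def setup_logging(debug: bool) -> None:
        # Child loggers ('ccb.chain', 'ccb.injest', ...) propagate their
        # records up to 'ccb', so one handler attached here is enough.
        parent = logging.getLogger('ccb')
        parent.setLevel(logging.DEBUG if debug else logging.INFO)
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        parent.addHandler(handler)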
7 changes: 3 additions & 4 deletions context_chat_backend/chain/ingest/doc_loader.py
@@ -3,11 +3,10 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
 
+import logging
 import re
 import tempfile
-import traceback
 from collections.abc import Callable
-from logging import error as log_error
 from typing import BinaryIO
 
 import docx2txt
@@ -19,6 +18,7 @@
 from pypdf import PdfReader
 from striprtf import striprtf
 
+logger = logging.getLogger('ccb.doc_loader')
 
 def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str:
     raw_bytes = file.read()
@@ -138,8 +138,7 @@ def decode_source(source: UploadFile) -> str | None:
         source.file.close()
         return result
     except Exception:
-        traceback.print_exc()
-        log_error(f'Error decoding source file ({source.filename})')
+        logger.exception(f'Error decoding source file ({source.filename})', stack_info=True)
         return None
     finally:
         source.file.close() # Ensure file is closed after processing
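The replacement above folds traceback.print_exc() and the separate error line into one call: logger.exception() logs at ERROR level and appends the active exception's traceback on its own, while stack_info=True additionally records the stack of frames that led to the logging call. A small standalone illustration of the same pattern (the file-reading helper is a placeholder):

    import logging

    logger = logging.getLogger('ccb.doc_loader')

    def read_text(path: str) -> str | None:
        try:
            with open(path, 'rb') as f:
                return f.read().decode('utf-8')
        except Exception:
            # One call replaces print_exc() plus a separate error message:
            # the traceback and, with stack_info=True, the caller's stack
            # are attached to this single ERROR record.
            logger.exception(f'Error decoding source file ({path})', stack_info=True)
            return None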
51 changes: 34 additions & 17 deletions context_chat_backend/chain/ingest/injest.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
 import re
 
 from fastapi.datastructures import UploadFile
@@ -17,6 +18,7 @@
 from .doc_splitter import get_splitter_for
 from .mimetype_list import SUPPORTED_MIMETYPES
 
+logger = logging.getLogger('ccb.injest')
 
 def _allowed_file(file: UploadFile) -> bool:
     return file.headers['type'] in SUPPORTED_MIMETYPES
@@ -52,13 +54,13 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
     indocuments = []
 
     for source in sources:
-        print('processing source:', source.filename, flush=True)
+        logger.debug('processing source', extra={ 'source_id': source.filename })
 
         # transform the source to have text data
         content = decode_source(source)
 
         if content is None or content == '':
-            print('decoded empty source:', source.filename, flush=True)
+            logger.debug('decoded empty source', extra={ 'source_id': source.filename })
             continue
 
         # replace more than two newlines with two newlines (also blank spaces, more than 4)
@@ -69,10 +71,10 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
         content = content.replace('\0', '')
 
         if content is None or content == '':
-            print('decoded empty source after cleanup:', source.filename, flush=True)
+            logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.filename })
             continue
 
-        print('decoded non empty source:', source.filename, flush=True)
+        logger.debug('decoded non empty source', extra={ 'source_id': source.filename })
 
         metadata = {
             'source': source.filename,
@@ -83,6 +85,10 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
 
         splitter = get_splitter_for(config.embedding_chunk_size, source.headers['type'])
         split_docs = splitter.split_documents([doc])
+        logger.debug('split document into chunks', extra={
+            'source_id': source.filename,
+            'len(split_docs)': len(split_docs),
+        })
 
         indocuments.append(InDocument(
             documents=split_docs,
@@ -105,11 +111,19 @@ def _process_sources(
     Returns the list of source ids that were successfully added.
     '''
     existing_sources, filtered_sources = _filter_sources(vectordb, sources)
+    logger.debug('db filter source results', extra={
+        'len(existing_sources)': len(existing_sources),
+        'existing_sources': existing_sources,
+        'len(filtered_sources)': len(filtered_sources),
+        'filtered_sources': filtered_sources,
+    })
 
     # update userIds for existing sources
     # allow the userIds as additional users, not as the only users
     if len(existing_sources) > 0:
-        print('Increasing access for existing sources:', [source.filename for source in existing_sources], flush=True)
+        logger.debug('Increasing access for existing sources', extra={
+            'source_ids': [source.filename for source in existing_sources]
+        })
         for source in existing_sources:
             try:
                 vectordb.update_access(
@@ -118,34 +132,36 @@
                     source.filename, # pyright: ignore[reportArgumentType]
                 )
             except SafeDbException as e:
-                print('Failed to update access for source (', source.filename, '):', e.args[0], flush=True)
+                logger.error(f'Failed to update access for source ({source.filename}): {e.args[0]}')
                 continue
 
     if len(filtered_sources) == 0:
         # no new sources to embed
-        print('Filtered all sources, nothing to embed', flush=True)
+        logger.debug('Filtered all sources, nothing to embed')
         return []
 
-    print('Filtered sources:', [source.filename for source in filtered_sources], flush=True)
+    logger.debug('Filtered sources:', extra={
+        'source_ids': [source.filename for source in filtered_sources]
+    })
    indocuments = _sources_to_indocuments(config, filtered_sources)
 
-    print('Converted sources to documents')
+    logger.debug('Converted all sources to documents')
 
     if len(indocuments) == 0:
         # document(s) were empty, not an error
-        print('All documents were found empty after being processed', flush=True)
+        logger.debug('All documents were found empty after being processed')
         return []
 
     added_sources = vectordb.add_indocuments(indocuments)
-    print('Added documents to vectordb', flush=True)
+    logger.debug('Added documents to vectordb')
     return added_sources
 
 
 def _decode_latin_1(s: str) -> str:
     try:
         return s.encode('latin-1').decode('utf-8')
     except UnicodeDecodeError:
-        print('Failed to decode latin-1:', s, flush=True)
+        logger.error('Failed to decode latin-1: %s', s)
         return s
 
 
@@ -161,11 +177,12 @@ def embed_sources(
         or _allowed_file(source)
     ]
 
-    print(
-        'Embedding sources:\n' +
-        '\n'.join([f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered]),
-        flush=True,
-    )
+    logger.debug('Embedding sources:', extra={
+        'source_ids': [
+            f'{source.filename} ({_decode_latin_1(source.headers["title"])})'
+            for source in sources_filtered
+        ],
+    })
 
     vectordb = vectordb_loader.load()
     return _process_sources(vectordb, config, sources_filtered)
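The extra={...} dicts throughout this file attach structured fields to each log record instead of interpolating them into the message. A stock logging.Formatter ignores such fields unless the format string names each one, and keys like 'len(split_docs)' are not even valid %-style attribute names, so a formatter along the following lines is presumably paired with these calls — this sketch is an assumption, not the formatter this commit ships:

    import logging

    # Attribute names every LogRecord carries by default; anything else on
    # a record must have arrived via the `extra` argument.
    _STANDARD_ATTRS = set(
        logging.LogRecord('', 0, '', 0, '', (), None).__dict__
    ) | {'message', 'asctime'}

    class ExtraFormatter(logging.Formatter):
        def format(self, record: logging.LogRecord) -> str:
            base = super().format(record)
            extras = {
                k: v for k, v in record.__dict__.items()
                if k not in _STANDARD_ATTRS
            }
            return f'{base} {extras}' if extras else base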
8 changes: 7 additions & 1 deletion context_chat_backend/chain/one_shot.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
+
 from langchain.llms.base import LLM
 
 from ..dyn_loader import VectorDBLoader
@@ -16,6 +18,7 @@
 {question}
 ''' # noqa: E501
 
+logger = logging.getLogger('ccb.chain')
 
 def process_query(
     user_id: str,
@@ -65,7 +68,10 @@ def process_context_query(
         raise ContextException('No documents retrieved, please index a few documents first')
 
     context_chunks = get_context_chunks(context_docs)
-    print('len(context_chunks)', len(context_chunks), flush=True)
+    logger.debug('context retrieved', extra={
+        'len(context_docs)': len(context_docs),
+        'len(context_chunks)': len(context_chunks),
+    })
 
     output = llm.invoke(
         get_pruned_query(llm, app_config, query, template or _LLM_TEMPLATE, context_chunks),
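The new record logs both the retrieved document count and the chunk count derived from it. A hypothetical reading of that docs-to-chunks step — the real get_context_chunks may add source headers or deduplicate, so treat this as a guess at its shape:

    from langchain.schema import Document

    def get_context_chunks(context_docs: list[Document]) -> list[str]:
        # Assumed behaviour: one text chunk per retrieved document.
        return [doc.page_content for doc in context_docs]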
15 changes: 9 additions & 6 deletions context_chat_backend/chain/query_proc.py
@@ -2,10 +2,13 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
+
 from langchain.llms.base import LLM
 
 from ..types import TConfig
 
+logger = logging.getLogger('ccb.chain')
 
 def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_chunks: list[str]) -> str:
     '''
@@ -60,11 +63,11 @@ def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_
         accepted_chunks.append(context)
         remaining_tokens -= context_tokens
 
-    print(
-        'total tokens:', n_ctx - remaining_tokens,
-        'remaining tokens:', remaining_tokens,
-        'accepted chunks:', len(accepted_chunks),
-        flush=True,
-    )
+    logger.debug('pruned query stats', extra={
+        'total tokens': n_ctx - remaining_tokens,
+        'remaining tokens': remaining_tokens,
+        'accepted chunks': len(accepted_chunks),
+        'total chunks': len(text_chunks),
+    })
 
     return template.format(context='\n\n'.join(accepted_chunks), question=query)
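The stats above come out of a loop that admits context chunks while they still fit in the model's context window. A condensed sketch of that budgeting logic, simplified from the hunk's surroundings (get_num_tokens is LangChain's token counter; the helper itself is illustrative):

    from langchain.llms.base import LLM

    def prune_chunks(llm: LLM, chunks: list[str], remaining_tokens: int) -> list[str]:
        accepted: list[str] = []
        for chunk in chunks:
            cost = llm.get_num_tokens(chunk)
            if cost > remaining_tokens:
                # Out of budget: stop before overflowing the context window.
                break
            accepted.append(chunk)
            remaining_tokens -= cost
        return accepted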
1 change: 1 addition & 0 deletions context_chat_backend/config_parser.py
@@ -49,6 +49,7 @@ def get_config(file_path: str) -> TConfig:
 
     return TConfig(
         debug=config.get('debug', False),
+        uvicorn_log_level=config.get('uvicorn_log_level', 'info'),
         disable_aaa=config.get('disable_aaa', False),
         httpx_verify_ssl=config.get('httpx_verify_ssl', True),
         use_colors=config.get('use_colors', True),
