fix: better logging
Signed-off-by: Anupam Kumar <[email protected]>
kyteinsky committed Jan 17, 2025
1 parent a936e85 commit bd9bf8b
Showing 27 changed files with 695 additions and 381 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration-test.yml
@@ -237,5 +237,5 @@ jobs:
           echo '--------------------------------------------------'
           [ -f context_chat_backend/backend_logs ] && cat context_chat_backend/backend_logs || echo "No backend logs"
           echo '--------------------------------------------------'
-          [ -f context_chat_backend/embedding_model.log ] && cat context_chat_backend/embedding_model.log || echo "No embedding backend logs"
+          tail -v -n +1 logs/* || echo "No logs in logs directory"
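The new CI step dumps every file under logs/ with a per-file banner: tail -n +1 prints each file from its first line, and -v always prefixes the '==> file <==' header. For illustration only, a Python equivalent of that behaviour (the helper name is made up):

    from pathlib import Path

    def dump_logs(log_dir: str = 'logs') -> None:
        files = sorted(Path(log_dir).glob('*'))
        if not files:
            print('No logs in logs directory')
            return
        for f in files:
            print(f'==> {f} <==')  # mirrors tail -v's per-file banner
            print(f.read_text(errors='replace'))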
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,4 +5,4 @@ __pycache__/
 .env
 persistent_storage/*
 .vscode/
-embedding_model.log
+logs/
1 change: 1 addition & 0 deletions config.cpu.yaml
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 debug: true
+uvicorn_log_level: info
 disable_aaa: false
 httpx_verify_ssl: true
 use_colors: true
1 change: 1 addition & 0 deletions config.gpu.yaml
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 debug: true
+uvicorn_log_level: info
 disable_aaa: false
 httpx_verify_ssl: true
 use_colors: true
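The new uvicorn_log_level key presumably feeds uvicorn's own logger. A minimal sketch of how such a setting could be wired up, assuming the server is started via uvicorn.run — the function and its wiring are illustrative, not this repository's actual startup code:

    import uvicorn

    def start(config) -> None:
        # 'uvicorn_log_level' comes from the YAML above; uvicorn accepts
        # 'critical' | 'error' | 'warning' | 'info' | 'debug' | 'trace'.
        uvicorn.run(
            'context_chat_backend:app',
            log_level=config.uvicorn_log_level,
            use_colors=config.use_colors,
        )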
5 changes: 5 additions & 0 deletions context_chat_backend/chain/context.py
@@ -2,11 +2,14 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
 
 from langchain.schema import Document
 
 from ..vectordb.base import BaseVectorDB
 from .types import ContextException, ScopeType
 
+logger = logging.getLogger('ccb.chain')
 
 def get_context_docs(
     user_id: str,
@@ -18,11 +21,13 @@ def get_context_docs(
 ) -> list[Document]:
     # unscoped search
     if not scope_type:
+        logger.debug('Searching for context docs without scope')
         return vectordb.doc_search(user_id, query, ctx_limit)
 
     if not scope_list:
         raise ContextException('Error: scope list must be provided and not empty if scope type is provided')
 
+    logger.debug('Searching for context docs with scope')
     return vectordb.doc_search(user_id, query, ctx_limit, scope_type, scope_list)
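Module loggers such as 'ccb.chain' here (and 'ccb.doc_loader', 'ccb.injest' below) all live under a common 'ccb' parent, so a single handler configured on that parent covers every module. A hypothetical one-time setup — the handler and format choices are assumptions, not taken from this commit:

    import logging

    def setup_logging(debug: bool) -> None:
        # Child loggers ('ccb.chain', 'ccb.injest', ...) propagate their
        # records up to 'ccb', so one handler attached here is enough.
        parent = logging.getLogger('ccb')
        parent.setLevel(logging.DEBUG if debug else logging.INFO)
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        parent.addHandler(handler)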
7 changes: 3 additions & 4 deletions context_chat_backend/chain/ingest/doc_loader.py
@@ -3,11 +3,10 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
 
+import logging
 import re
 import tempfile
-import traceback
 from collections.abc import Callable
-from logging import error as log_error
 from typing import BinaryIO
 
 import docx2txt
@@ -19,6 +18,7 @@
 from pypdf import PdfReader
 from striprtf import striprtf
 
+logger = logging.getLogger('ccb.doc_loader')
 
 def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str:
     raw_bytes = file.read()
@@ -138,8 +138,7 @@ def decode_source(source: UploadFile) -> str | None:
         source.file.close()
         return result
     except Exception:
-        traceback.print_exc()
-        log_error(f'Error decoding source file ({source.filename})')
+        logger.exception(f'Error decoding source file ({source.filename})', stack_info=True)
         return None
     finally:
         source.file.close() # Ensure file is closed after processing
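The replacement above folds traceback.print_exc() and the separate error line into one call: logger.exception() logs at ERROR level and appends the active exception's traceback on its own, while stack_info=True additionally records the stack of frames that led to the logging call. A small standalone illustration of the same pattern (the file-reading helper is a placeholder):

    import logging

    logger = logging.getLogger('ccb.doc_loader')

    def read_text(path: str) -> str | None:
        try:
            with open(path, 'rb') as f:
                return f.read().decode('utf-8')
        except Exception:
            # One call replaces print_exc() plus a separate error message:
            # the traceback and, with stack_info=True, the caller's stack
            # are attached to this single ERROR record.
            logger.exception(f'Error decoding source file ({path})', stack_info=True)
            return None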
51 changes: 34 additions & 17 deletions context_chat_backend/chain/ingest/injest.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
 import re
 
 from fastapi.datastructures import UploadFile
@@ -17,6 +18,7 @@
 from .doc_splitter import get_splitter_for
 from .mimetype_list import SUPPORTED_MIMETYPES
 
+logger = logging.getLogger('ccb.injest')
 
 def _allowed_file(file: UploadFile) -> bool:
     return file.headers['type'] in SUPPORTED_MIMETYPES
@@ -52,13 +54,13 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
     indocuments = []
 
     for source in sources:
-        print('processing source:', source.filename, flush=True)
+        logger.debug('processing source', extra={ 'source_id': source.filename })
 
         # transform the source to have text data
         content = decode_source(source)
 
         if content is None or content == '':
-            print('decoded empty source:', source.filename, flush=True)
+            logger.debug('decoded empty source', extra={ 'source_id': source.filename })
             continue
 
         # replace more than two newlines with two newlines (also blank spaces, more than 4)
@@ -69,10 +71,10 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
         content = content.replace('\0', '')
 
         if content is None or content == '':
-            print('decoded empty source after cleanup:', source.filename, flush=True)
+            logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.filename })
             continue
 
-        print('decoded non empty source:', source.filename, flush=True)
+        logger.debug('decoded non empty source', extra={ 'source_id': source.filename })
 
         metadata = {
             'source': source.filename,
@@ -83,6 +85,10 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
 
         splitter = get_splitter_for(config.embedding_chunk_size, source.headers['type'])
         split_docs = splitter.split_documents([doc])
+        logger.debug('split document into chunks', extra={
+            'source_id': source.filename,
+            'len(split_docs)': len(split_docs),
+        })
 
         indocuments.append(InDocument(
             documents=split_docs,
@@ -105,11 +111,19 @@ def _process_sources(
     Returns the list of source ids that were successfully added.
     '''
     existing_sources, filtered_sources = _filter_sources(vectordb, sources)
+    logger.debug('db filter source results', extra={
+        'len(existing_sources)': len(existing_sources),
+        'existing_sources': existing_sources,
+        'len(filtered_sources)': len(filtered_sources),
+        'filtered_sources': filtered_sources,
+    })
 
     # update userIds for existing sources
     # allow the userIds as additional users, not as the only users
     if len(existing_sources) > 0:
-        print('Increasing access for existing sources:', [source.filename for source in existing_sources], flush=True)
+        logger.debug('Increasing access for existing sources', extra={
+            'source_ids': [source.filename for source in existing_sources]
+        })
         for source in existing_sources:
             try:
                 vectordb.update_access(
@@ -118,34 +132,36 @@
                     source.filename, # pyright: ignore[reportArgumentType]
                 )
             except SafeDbException as e:
-                print('Failed to update access for source (', source.filename, '):', e.args[0], flush=True)
+                logger.error(f'Failed to update access for source ({source.filename}): {e.args[0]}')
                 continue
 
     if len(filtered_sources) == 0:
         # no new sources to embed
-        print('Filtered all sources, nothing to embed', flush=True)
+        logger.debug('Filtered all sources, nothing to embed')
         return []
 
-    print('Filtered sources:', [source.filename for source in filtered_sources], flush=True)
+    logger.debug('Filtered sources:', extra={
+        'source_ids': [source.filename for source in filtered_sources]
+    })
    indocuments = _sources_to_indocuments(config, filtered_sources)
 
-    print('Converted sources to documents')
+    logger.debug('Converted all sources to documents')
 
     if len(indocuments) == 0:
         # document(s) were empty, not an error
-        print('All documents were found empty after being processed', flush=True)
+        logger.debug('All documents were found empty after being processed')
         return []
 
     added_sources = vectordb.add_indocuments(indocuments)
-    print('Added documents to vectordb', flush=True)
+    logger.debug('Added documents to vectordb')
     return added_sources
 
 
 def _decode_latin_1(s: str) -> str:
     try:
         return s.encode('latin-1').decode('utf-8')
     except UnicodeDecodeError:
-        print('Failed to decode latin-1:', s, flush=True)
+        logger.error('Failed to decode latin-1: %s', s)
         return s
 
 
@@ -161,11 +177,12 @@ def embed_sources(
         or _allowed_file(source)
     ]
 
-    print(
-        'Embedding sources:\n' +
-        '\n'.join([f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered]),
-        flush=True,
-    )
+    logger.debug('Embedding sources:', extra={
+        'source_ids': [
+            f'{source.filename} ({_decode_latin_1(source.headers["title"])})'
+            for source in sources_filtered
+        ],
+    })
 
     vectordb = vectordb_loader.load()
     return _process_sources(vectordb, config, sources_filtered)
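The extra={...} dicts throughout this file attach structured fields to each log record instead of interpolating them into the message. A stock logging.Formatter ignores such fields unless the format string names each one, and keys like 'len(split_docs)' are not even valid %-style attribute names, so a formatter along the following lines is presumably paired with these calls — this sketch is an assumption, not the formatter this commit ships:

    import logging

    # Attribute names every LogRecord carries by default; anything else on
    # a record must have arrived via the `extra` argument.
    _STANDARD_ATTRS = set(
        logging.LogRecord('', 0, '', 0, '', (), None).__dict__
    ) | {'message', 'asctime'}

    class ExtraFormatter(logging.Formatter):
        def format(self, record: logging.LogRecord) -> str:
            base = super().format(record)
            extras = {
                k: v for k, v in record.__dict__.items()
                if k not in _STANDARD_ATTRS
            }
            return f'{base} {extras}' if extras else base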
8 changes: 7 additions & 1 deletion context_chat_backend/chain/one_shot.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
+
 from langchain.llms.base import LLM
 
 from ..dyn_loader import VectorDBLoader
@@ -16,6 +18,7 @@
 {question}
 ''' # noqa: E501
 
+logger = logging.getLogger('ccb.chain')
 
 def process_query(
     user_id: str,
@@ -65,7 +68,10 @@ def process_context_query(
         raise ContextException('No documents retrieved, please index a few documents first')
 
     context_chunks = get_context_chunks(context_docs)
-    print('len(context_chunks)', len(context_chunks), flush=True)
+    logger.debug('context retrieved', extra={
+        'len(context_docs)': len(context_docs),
+        'len(context_chunks)': len(context_chunks),
+    })
 
     output = llm.invoke(
         get_pruned_query(llm, app_config, query, template or _LLM_TEMPLATE, context_chunks),
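The new record logs both the retrieved document count and the chunk count derived from it. A hypothetical reading of that docs-to-chunks step — the real get_context_chunks may add source headers or deduplicate, so treat this as a guess at its shape:

    from langchain.schema import Document

    def get_context_chunks(context_docs: list[Document]) -> list[str]:
        # Assumed behaviour: one text chunk per retrieved document.
        return [doc.page_content for doc in context_docs]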
15 changes: 9 additions & 6 deletions context_chat_backend/chain/query_proc.py
@@ -2,10 +2,13 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import logging
+
 from langchain.llms.base import LLM
 
 from ..types import TConfig
 
+logger = logging.getLogger('ccb.chain')
 
 def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_chunks: list[str]) -> str:
     '''
@@ -60,11 +63,11 @@ def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_
         accepted_chunks.append(context)
         remaining_tokens -= context_tokens
 
-    print(
-        'total tokens:', n_ctx - remaining_tokens,
-        'remaining tokens:', remaining_tokens,
-        'accepted chunks:', len(accepted_chunks),
-        flush=True,
-    )
+    logger.debug('pruned query stats', extra={
+        'total tokens': n_ctx - remaining_tokens,
+        'remaining tokens': remaining_tokens,
+        'accepted chunks': len(accepted_chunks),
+        'total chunks': len(text_chunks),
+    })
 
     return template.format(context='\n\n'.join(accepted_chunks), question=query)
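The stats above come out of a loop that admits context chunks while they still fit in the model's context window. A condensed sketch of that budgeting logic, simplified from the hunk's surroundings (get_num_tokens is LangChain's token counter; the helper itself is illustrative):

    from langchain.llms.base import LLM

    def prune_chunks(llm: LLM, chunks: list[str], remaining_tokens: int) -> list[str]:
        accepted: list[str] = []
        for chunk in chunks:
            cost = llm.get_num_tokens(chunk)
            if cost > remaining_tokens:
                # Out of budget: stop before overflowing the context window.
                break
            accepted.append(chunk)
            remaining_tokens -= cost
        return accepted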
1 change: 1 addition & 0 deletions context_chat_backend/config_parser.py
@@ -49,6 +49,7 @@ def get_config(file_path: str) -> TConfig:
 
     return TConfig(
         debug=config.get('debug', False),
+        uvicorn_log_level=config.get('uvicorn_log_level', 'info'),
         disable_aaa=config.get('disable_aaa', False),
         httpx_verify_ssl=config.get('httpx_verify_ssl', True),
         use_colors=config.get('use_colors', True),
