feat/init route #5

Merged 5 commits on Feb 22, 2024
166 changes: 166 additions & 0 deletions .github/workflows/integration-test.yml
@@ -0,0 +1,166 @@
# SPDX-FileCopyrightText: Nextcloud contributors
# SPDX-License-Identifier: AGPL-3.0-or-later

name: Integration test

on:
pull_request:
paths:
- main.py
- config.yaml
- context_chat_backend/**
- appinfo/**
- example.env
push:
branches:
- master
paths:
- main.py
- config.yaml
- context_chat_backend/**
- appinfo/**
- example.env

concurrency:
group: integration-test-${{ github.head_ref || github.run_id }}
cancel-in-progress: true


jobs:
integration-test:
runs-on: ubuntu-latest

strategy:
# do not stop on another job's failure
fail-fast: false
matrix:
php-versions: [ '8.1' ]
databases: [ 'sqlite' ]
server-versions: [ 'master', 'stable28' ]

name: Integration test on ${{ matrix.server-versions }} php@${{ matrix.php-versions }}

env:
MYSQL_PORT: 4444
PGSQL_PORT: 4445

services:
mysql:
image: mariadb:10.5
ports:
- 4444:3306/tcp
env:
MYSQL_ROOT_PASSWORD: rootpassword
options: --health-cmd="mysqladmin ping" --health-interval 5s --health-timeout 2s --health-retries 5
postgres:
image: postgres
ports:
- 4445:5432/tcp
env:
POSTGRES_USER: root
POSTGRES_PASSWORD: rootpassword
POSTGRES_DB: nextcloud
options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5

steps:
- name: Checkout server
uses: actions/checkout@v4
with:
repository: nextcloud/server
ref: ${{ matrix.server-versions }}

- name: Checkout submodules
shell: bash
run: |
auth_header="$(git config --local --get http.https://github.com/.extraheader)"
git submodule sync --recursive
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1

- name: Set up php ${{ matrix.php-versions }}
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-versions }}
tools: phpunit
extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_mysql, pdo_sqlite, pgsql, pdo_pgsql, gd, zip

- name: Checkout context_chat php app
uses: actions/checkout@v4
with:
repository: nextcloud/context_chat
path: apps/context_chat

- name: Checkout AppAPI
uses: actions/checkout@v4
with:
repository: cloud-py-api/app_api
path: apps/app_api

- name: Checkout backend
uses: actions/checkout@v4
with:
path: context_chat_backend/

- name: Set up Nextcloud
if: ${{ matrix.databases != 'pgsql'}}
run: |
sleep 25
mkdir data
./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$MYSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password
php -S localhost:8080 &

- name: Set up Nextcloud
if: ${{ matrix.databases == 'pgsql'}}
run: |
sleep 25
mkdir data
./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$PGSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password
php -S localhost:8080 &

- name: Enable context_chat and app_api
run: ./occ app:enable -vvv -f context_chat app_api

- name: Checkout documentation
uses: actions/checkout@v4
with:
repository: nextcloud/documentation
path: data/admin/files/documentation

- name: Prepare docs
run: |
cd data/admin/files/documentation
find ./ -depth -name "*.rst" -exec sh -c 'mv "$1" "${1%.rst}.txt"' _ {} \;
git status

- name: Setup python 3.11
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install and init backend
run: |
cd context_chat_backend
pip install --no-deps -r reqs.txt
cp example.env .env
echo "NEXTCLOUD_URL=http://localhost:8080" >> .env
./main.py &> backend_logs &

- name: Register backend
run: |
./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080
./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"1.1.1\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish

- name: Scan files
run: |
./occ files:scan --all
./occ context_chat:scan -m text/plain admin

- name: Run prompt
run: |
./occ context_chat:prompt admin "Which factors are taken into account for the Ethical AI Rating?"

- name: Show logs
if: always()
run: |
tail data/nextcloud.log
echo '--------------------------------------------------'
tail context_chat_backend/backend_logs
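
For readability, the escaped --json-info payload passed to occ app_api:app:register in the "Register backend" step expands to the JSON below. A small Python sketch with the values copied verbatim from the step; json.dumps with compact separators reproduces the escaped argument:

import json

# --json-info payload from the "Register backend" step, unescaped for readability
app_info = {
    'appid': 'context_chat_backend',
    'name': 'Context Chat Backend',
    'daemon_config_name': 'manual_install',
    'version': '1.1.1',
    'secret': '12345',
    'port': 10034,
    'scopes': [],
    'system_app': 0,
}
# compact separators match the escaped string in the workflow
print(json.dumps(app_info, separators=(',', ':')))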
3 changes: 1 addition & 2 deletions .gitignore
@@ -1,6 +1,5 @@
.venv/
__pycache__/
.env
model_files/*
vector_db_data/*
persistent_storage/*
.vscode/
6 changes: 0 additions & 6 deletions Dockerfile
@@ -1,11 +1,5 @@
FROM python:3.11-bookworm

VOLUME /app/model_files
VOLUME /app/vector_db_data
ENV VECTORDB_DIR /app/vector_db_data
ENV SENTENCE_TRANSFORMERS_HOME /app/model_files
ENV TRANSFORMERS_CACHE /app/model_files

RUN apt update && apt install -y --no-install-recommends pandoc

WORKDIR /app
7 changes: 3 additions & 4 deletions config.yaml
@@ -1,7 +1,6 @@
vectordb:
chroma:
is_persistent: True
persist_directory: ./vector_db_data
# chroma_server_host:
# chroma_server_http_port
# chroma_server_ssl_enabled
@@ -16,7 +15,7 @@ embedding:
device: cpu

llama:
model_path: model_files/dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
model_path: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
n_batch: 16
n_ctx: 2048

@@ -28,14 +27,14 @@ embedding:

llm:
llama:
model_path: model_files/dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
model_path: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
n_batch: 10
n_ctx: 4096
template: "<|im_start|> system \nYou're an AI assistant good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly. \nQuestion: {question} Let's think this step-by-step. \n<|im_end|>\n<|im_start|> assistant\n"
end_separator: <|im_end|>

ctransformer:
model: model_files/dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
model: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
template: "<|im_start|> system \nYou're an AI assistant good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to excersice source critisicm as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly. \nQuestion: {question} Let's think this step-by-step. \n<|im_end|>\n<|im_start|> assistant\n"
end_separator: <|im_end|>
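
Note that the model_path entries are now bare filenames instead of model_files/… paths; they are presumably resolved against the MODEL_DIR that the backend sets up at startup (see context_chat_backend/__init__.py below). A minimal Python sketch of that resolution, assuming a simple path join — the actual loader may differ:

import os

# Assumption: the model loader joins MODEL_DIR with the configured filename
model_dir = os.getenv('MODEL_DIR', 'persistent_storage/model_files')
model_path = os.path.join(model_dir, 'dolphin-2.2.1-mistral-7b.Q5_K_M.gguf')
print(model_path)  # persistent_storage/model_files/dolphin-2.2.1-mistral-7b.Q5_K_M.gguf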

74 changes: 22 additions & 52 deletions context_chat_backend/__init__.py
@@ -1,70 +1,40 @@
from os import getenv
import os

from dotenv import load_dotenv
import uvicorn

from .config_parser import get_config
from .controller import app
from .download import download_all_models
from .models import models
from .download import model_init
from .utils import to_int
from .vectordb import vector_dbs

load_dotenv()

__all__ = ['create_server', 'vector_dbs', 'models']
__all__ = ['app', 'to_int']


def create_server(config: dict[str, tuple[str, dict]]):
def _setup_env_vars():
'''
Creates a FastAPI server with the given config.

Args
----
config: dict
A dictionary containing the services to be deployed.
Sets up the environment variables for persistent storage.
'''
if getenv('DISABLE_CUSTOM_DOWNLOAD_URI', '0') != '1':
if (model_name := download_all_models(config)) is not None:
raise Exception(f'Error: Model download failed for {model_name}')

app.extra['CONFIG'] = config

if config.get('embedding'):
from .models import init_model

model = init_model('embedding', config.get('embedding'))
app.extra['EMBEDDING_MODEL'] = model

if config.get('vectordb'):
from .vectordb import get_vector_db
persistent_storage = os.getenv('APP_PERSISTENT_STORAGE', 'persistent_storage')

client_klass = get_vector_db(config.get('vectordb')[0])
vector_db_dir = os.path.join(persistent_storage, 'vector_db_data')
if not os.path.exists(vector_db_dir):
os.makedirs(vector_db_dir, 0o750, True)

if app.extra.get('EMBEDDING_MODEL') is not None:
app.extra['VECTOR_DB'] = client_klass(app.extra['EMBEDDING_MODEL'], **config.get('vectordb')[1])
else:
app.extra['VECTOR_DB'] = client_klass(**config.get('vectordb')[1])
model_dir = os.path.join(persistent_storage, 'model_files')
if not os.path.exists(model_dir):
os.makedirs(model_dir, 0o750, True)

if config.get('llm'):
from .models import init_model
os.environ['APP_PERSISTENT_STORAGE'] = persistent_storage
os.environ['VECTORDB_DIR'] = vector_db_dir
os.environ['MODEL_DIR'] = model_dir
os.environ['SENTENCE_TRANSFORMERS_HOME'] = os.getenv('SENTENCE_TRANSFORMERS_HOME', model_dir)
os.environ['TRANSFORMERS_CACHE'] = os.getenv('TRANSFORMERS_CACHE', model_dir)

llm_name, llm_config = config.get('llm')
app.extra['LLM_TEMPLATE'] = llm_config.pop('template', '')
app.extra['LLM_END_SEPARATOR'] = llm_config.pop('end_separator', '')

model = init_model('llm', (llm_name, llm_config))
app.extra['LLM_MODEL'] = model
_setup_env_vars()

uvicorn.run(
app=app,
host=getenv('APP_HOST', '0.0.0.0'),
port=to_int(getenv('APP_PORT'), 9000),
http='h11',
interface='asgi3',
log_level=('warning', 'trace')[getenv('DEBUG', '0') == '1'],
use_colors=True,
limit_concurrency=100,
backlog=100,
timeout_keep_alive=10,
h11_max_incomplete_event_size=5 * 1024 * 1024, # 5MB
)
app.extra['CONFIG'] = get_config()
app.extra['ENABLED'] = model_init(app)
print('App', 'enabled' if app.extra['ENABLED'] else 'disabled', 'at startup', flush=True)
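
As a quick illustration of the directory layout _setup_env_vars produces: with APP_PERSISTENT_STORAGE unset, everything lands under a local persistent_storage/ directory. A minimal sketch mirroring the function's defaults (not part of the module):

import os

# Mirrors _setup_env_vars' path logic when APP_PERSISTENT_STORAGE is unset
storage = os.getenv('APP_PERSISTENT_STORAGE', 'persistent_storage')
print(os.path.join(storage, 'vector_db_data'))  # VECTORDB_DIR
print(os.path.join(storage, 'model_files'))     # MODEL_DIR; also the fallback for
                                                # SENTENCE_TRANSFORMERS_HOME and
                                                # TRANSFORMERS_CACHE unless already set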
5 changes: 5 additions & 0 deletions context_chat_backend/chain/ingest/doc_loader.py
Original file line number Diff line number Diff line change
@@ -122,6 +122,11 @@ def _load_org(file: BinaryIO) -> str:

def decode_source(source: UploadFile) -> str | None:
try:
# .pot files are powerpoint templates but also plain text files,
# so we skip them to prevent decoding errors
if (source.headers.get('title') or '').endswith('.pot'):
return None

if _loader_map.get(source.headers.get('type')):
return _loader_map[source.headers.get('type')](source.file)
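
A minimal standalone sketch of the new .pot guard (a plain dict stands in for the upload headers; the filename is illustrative):

# None-safe variant of the check: 'title' may be absent from the headers
headers = {'title': 'slides.pot', 'type': 'text/plain'}
if (headers.get('title') or '').endswith('.pot'):
    print('skipping .pot template to avoid decoding errors')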

58 changes: 58 additions & 0 deletions context_chat_backend/config_parser.py
@@ -0,0 +1,58 @@
from pprint import pprint

from ruamel.yaml import YAML

from .models import models
from .vectordb import vector_dbs


def _first_in_list(
input_dict: dict[str, dict],
supported_list: list[str]
) -> tuple[str, dict] | None:
'''
Find the first matching item in the input list from the supported list.
This is done to find the first supported item in the config file.
'''
for input_item, value in input_dict.items():
if input_item in supported_list:
return (input_item, value or {})

return None


def get_config(file_path: str = 'config.yaml') -> dict[str, tuple[str, dict]]:
'''
Get the config from the given file path (relative to the root directory).
'''
with open(file_path) as f:
try:
yaml = YAML(typ='safe')
config: dict = yaml.load(f)
except Exception as e:
raise AssertionError(f'Error: could not load config from {file_path}') from e

selected_config = {
'vectordb': _first_in_list(config.get('vectordb', {}), vector_dbs),
'embedding': _first_in_list(config.get('embedding', {}), models['embedding']),
'llm': _first_in_list(config.get('llm', {}), models['llm']),
}

if not selected_config['vectordb']:
raise AssertionError(
f'Error: vectordb should be at least one of {vector_dbs} in the config file'
)

if not selected_config['embedding']:
raise AssertionError(
f'Error: embedding model should be at least one of {models["embedding"]} in the config file'
)

if not selected_config['llm']:
raise AssertionError(
f'Error: llm model should be at least one of {models["llm"]} in the config file'
)

pprint(f'Selected config: {selected_config}')

return selected_config
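
A toy illustration of the selection behaviour: _first_in_list walks the configured dict in insertion order and returns the first key that is also in the supported list, so unsupported entries are silently skipped. The names here are hypothetical:

# 'faiss' is not in the supported list, so 'chroma' is selected
supported = ['chroma', 'weaviate']
configured = {'faiss': {'k': 4}, 'chroma': {'is_persistent': True}}
assert _first_in_list(configured, supported) == ('chroma', {'is_persistent': True})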