Skip to content

Commit

Permalink
Attempt to address memory leak in doc_loader.py (#89)
Browse files Browse the repository at this point in the history
Signed-off-by: Marcel Klehr <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
marcelklehr and pre-commit-ci[bot] authored Oct 21, 2024
1 parent fae4801 commit be685f6
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 11 deletions.
47 changes: 46 additions & 1 deletion .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,28 +153,73 @@ jobs:
pip install --no-deps -r requirements.txt
cp example.env .env
echo "NEXTCLOUD_URL=http://localhost:8080" >> .env
./main.py | tee backend_logs &
./main.py > backend_logs 2>&1 &
echo $! > ../pid.txt # Save the process ID (PID)
# Registers an AppAPI daemon config named "manual_install", then registers the
# context_chat_backend app against that daemon. The app version is taken from
# the output of the earlier `appinfo` step.
- name: Register backend
run: |
./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080
./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish
# Baseline reading, taken before the scan step runs: prints pid/cmd/%mem/rss
# for the process whose PID is stored in pid.txt, then saves the bare %mem
# value to initial_mem.txt.
- name: Check python memory usage
run: |
ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem
ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt
# Runs the Nextcloud file scan for user "admin", followed by the context_chat
# scan for the same user.
- name: Scan files
run: |
./occ files:scan admin
./occ context_chat:scan admin
# Second reading, taken after the scan step: same ps invocation as the
# baseline, with the bare %mem value saved to after_scan_mem.txt.
- name: Check python memory usage
run: |
ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem
ps -p $(cat pid.txt) -o %mem --no-headers > after_scan_mem.txt
# Starts two background-job workers for SynchronousBackgroundJob in the
# background (trailing `&`), then sends one English and one German prompt as
# user "admin".
- name: Run the prompts
run: |
./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' &
./occ background-job:worker 'OC\TaskProcessing\SynchronousBackgroundJob' &
./occ context_chat:prompt admin "Which factors are taken into account for the Ethical AI Rating?"
./occ context_chat:prompt admin "Welche Faktoren beeinflussen das Ethical AI Rating?"
# Third reading, taken after the prompts step: same ps invocation as before,
# with the bare %mem value saved to after_prompt_mem.txt.
- name: Check python memory usage
run: |
ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem
ps -p $(cat pid.txt) -o %mem --no-headers > after_prompt_mem.txt
# Fails the job when the backend's %mem after the scan is higher than the
# baseline reading. Named distinctly from the prompt comparison below so the
# two steps can be told apart in the Actions UI.
- name: Compare memory usage and detect leak (scan)
  run: |
    # ps pads %mem with spaces; strip them so bc receives plain numbers.
    initial_mem=$(tr -d ' ' < initial_mem.txt)
    final_mem=$(tr -d ' ' < after_scan_mem.txt)
    # An empty reading would make bc print nothing and (( )) evaluate to
    # false, silently reporting "no leak" -- fail loudly instead.
    if [ -z "$initial_mem" ] || [ -z "$final_mem" ]; then
      echo "Could not read the memory usage of the backend process."
      exit 1
    fi
    echo "Initial Memory Usage: $initial_mem%"
    echo "Memory Usage after scan: $final_mem%"
    # bc prints 1 when the comparison holds and 0 otherwise.
    if (( $(echo "$final_mem > $initial_mem" | bc -l) )); then
      echo "Memory usage has increased during scan. Possible memory leak detected!"
      exit 1
    else
      echo "Memory usage during scan is stable. No memory leak detected."
    fi
# Fails the job when the backend's %mem after the prompts is higher than the
# reading taken after the scan. Named distinctly from the scan comparison so
# the two steps can be told apart in the Actions UI.
- name: Compare memory usage and detect leak (prompt)
  run: |
    # ps pads %mem with spaces; strip them so bc receives plain numbers.
    after_scan_mem=$(tr -d ' ' < after_scan_mem.txt)
    after_prompt_mem=$(tr -d ' ' < after_prompt_mem.txt)
    # An empty reading would make bc print nothing and (( )) evaluate to
    # false, silently reporting "no leak" -- fail loudly instead.
    if [ -z "$after_scan_mem" ] || [ -z "$after_prompt_mem" ]; then
      echo "Could not read the memory usage of the backend process."
      exit 1
    fi
    # The reference value here is the after-scan reading, not the initial one.
    echo "Memory Usage after scan: $after_scan_mem%"
    echo "Memory Usage after prompt: $after_prompt_mem%"
    # bc prints 1 when the comparison holds and 0 otherwise.
    if (( $(echo "$after_prompt_mem > $after_scan_mem" | bc -l) )); then
      echo "Memory usage has increased during prompt. Possible memory leak detected!"
      exit 1
    else
      echo "Memory usage during prompt is stable. No memory leak detected."
    fi
# Always runs, even after a failed step, so the logs are available for
# debugging: prints the tail of the Nextcloud log and the full backend log.
- name: Show logs
  if: always()
  run: |
    # The default Actions shell is `bash -e`; without the fallback a missing
    # Nextcloud log would abort this step before the backend logs are shown.
    tail data/nextcloud.log || echo "No nextcloud logs"
    echo '--------------------------------------------------'
    # Explicit if/else so the fallback message is printed only when the file
    # is absent, not when cat itself fails.
    if [ -f context_chat_backend/backend_logs ]; then
      cat context_chat_backend/backend_logs
    else
      echo "No backend logs"
    fi
26 changes: 16 additions & 10 deletions context_chat_backend/chain/ingest/doc_loader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import gc
import tempfile
import traceback
from collections.abc import Callable
Expand All @@ -17,15 +18,13 @@

def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str:
raw_bytes = file.read()
tmp = tempfile.NamedTemporaryFile(mode='wb')
tmp.write(raw_bytes)
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmp:
tmp.write(raw_bytes)
docs = loader(tmp.name)

docs = loader(tmp.name)

tmp.close()
if not tmp.delete:
import os
os.remove(tmp.name)
if not tmp.delete:
import os
os.remove(tmp.name)

if isinstance(docs, str) or isinstance(docs, bytes):
return docs.decode('utf-8') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType]
Expand Down Expand Up @@ -127,10 +126,17 @@ def decode_source(source: UploadFile) -> str | None:
return None

if _loader_map.get(mimetype):
return _loader_map[mimetype](source.file)
result = _loader_map[mimetype](source.file)
source.file.close()
return result

return source.file.read().decode('utf-8')
result = source.file.read().decode('utf-8')
source.file.close()
return result
except Exception:
traceback.print_exc()
log_error(f'Error decoding source file ({source.filename})')
return None
finally:
source.file.close() # Ensure file is closed after processing
gc.collect() # Force garbage collection to free up memory

0 comments on commit be685f6

Please sign in to comment.