Reduz tamanho das queries temáticas para menor uso de heap (#54)

Em produção, as queries que geram novas entradas em índices temáticos a partir de muitos documentos (por exemplo, ao criar um novo índice a partir da base completa) estão atingindo o uso máximo do heap do Elasticsearch (atualmente 4GB). Exemplo de log:

```
[2023-10-26T18:20:26,932][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [elasticsearch] attempting to trigger G1GC due to high heap usage [3969385000]
[2023-10-26T18:20:26,963][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [elasticsearch] GC did bring memory usage down, before [3969385000], after [998223872], allocations [43], duration [31]
```

Por conta desse circuit breaker do Elasticsearch, o scroll da query é perdido e, eventualmente, as queries resultam em erro por falha na conexão. Este PR visa reduzir o uso do heap tornando cada query mais leve, tanto no parâmetro `size` (de 100 para 10 resultados por página) quanto no número de documentos filtrados por query (lotes de 500 IDs).
okfn-brasil · Oct 27, 2023 · 2faf34f · 2faf34f
2 parents cc44fbf + 366783e
commit 2faf34f
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 17 deletions.
diff --git a/tasks/gazette_themed_excerpts_extraction.py b/tasks/gazette_themed_excerpts_extraction.py
@@ -2,7 +2,7 @@
 from typing import Dict, Iterable, List
 
 from .interfaces import IndexInterface
-from .utils import clean_extra_whitespaces, get_documents_from_query_with_highlights
+from .utils import batched, clean_extra_whitespaces, get_documents_from_query_with_highlights
 
 
 def extract_themed_excerpts_from_gazettes(
@@ -12,21 +12,22 @@ def extract_themed_excerpts_from_gazettes(
 
     ids = []
     for theme_query in theme["queries"]:
-        for excerpt in get_excerpts_from_gazettes_with_themed_query(
-            theme_query, gazette_ids, index
-        ):
-            # excerpts with less than 10% of the expected size of excerpt account for 
-            # fewer than 1% of excerpts yet their score is usually high
-            if len(excerpt["excerpt"]) < 200:
-                continue
-
-            index.index_document(
-                excerpt,
-                document_id=excerpt["excerpt_id"],
-                index=theme["index"],
-                refresh=True,
-            )
-            ids.append(excerpt["excerpt_id"])
+        for batch in batched(gazette_ids, 500):
+            for excerpt in get_excerpts_from_gazettes_with_themed_query(
+                theme_query, batch, index
+            ):
+                # excerpts with less than 10% of the expected size of excerpt account for 
+                # fewer than 1% of excerpts yet their score is usually high
+                if len(excerpt["excerpt"]) < 200:
+                    continue
+
+                index.index_document(
+                    excerpt,
+                    document_id=excerpt["excerpt_id"],
+                    index=theme["index"],
+                    refresh=True,
+                )
+                ids.append(excerpt["excerpt_id"])
 
     return ids
 
@@ -152,7 +153,7 @@ def get_es_query_from_themed_query(
 ) -> Dict:
     es_query = {
         "query": {"bool": {"must": [], "filter": {"ids": {"values": gazette_ids}}}},
-        "size": 100,
+        "size": 10,
         "highlight": {
             "fields": {
                 "source_text.with_stopwords": {

diff --git a/tasks/utils/__init__.py b/tasks/utils/__init__.py
@@ -2,4 +2,5 @@
     get_documents_from_query_with_highlights,
     get_documents_with_ids,
 )
+from .iter import batched
 from .text import clean_extra_whitespaces
diff --git a/tasks/utils/iter.py b/tasks/utils/iter.py
@@ -0,0 +1,11 @@
+from itertools import islice
+
+
+def batched(iterable, n):
+    # Yield successive tuples of up to n items: batched('ABCDEFG', 3) --> ABC DEF G
+    # Can be removed once on Python 3.12, in favor of itertools.batched
+    if n < 1:
+        raise ValueError('n must be at least one')
+    it = iter(iterable)
+    while batch := tuple(islice(it, n)):
+        yield batch