Skip to content

Commit

Permalink
Reduz tamanho das queries temáticas para menor uso de heap (#54)
Browse files Browse the repository at this point in the history
Em produção, as queries que geram novas entradas em índices temáticos a
partir de muitos documentos (exemplo: ao criar um novo índice a partir
da base completa) estão atingindo o uso máximo do heap (atualmente 4GB).

Exemplo de log:
```
[2023-10-26T18:20:26,932][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [elasticsearch] attempting to trigger G1GC due to high heap usage [3969385000]
[2023-10-26T18:20:26,963][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [elasticsearch] GC did bring memory usage down, before [3969385000], after [998223872], allocations [43], duration [31]
```

Por conta desse circuit breaker do elasticsearch, o scroll da query
desaparece e eventualmente as queries resultam em erro por falha na
conexão.

Este PR visa reduzir o uso do heap tornando cada query mais leve: o
parâmetro `size` da query passa de 100 para 10 e os documentos passam a
ser consultados em lotes de 500 ids por query, em vez de todos de uma vez.
  • Loading branch information
tigreped authored Oct 27, 2023
2 parents cc44fbf + 366783e commit 2faf34f
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 17 deletions.
35 changes: 18 additions & 17 deletions tasks/gazette_themed_excerpts_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Dict, Iterable, List

from .interfaces import IndexInterface
from .utils import clean_extra_whitespaces, get_documents_from_query_with_highlights
from .utils import batched, clean_extra_whitespaces, get_documents_from_query_with_highlights


def extract_themed_excerpts_from_gazettes(
Expand All @@ -12,21 +12,22 @@ def extract_themed_excerpts_from_gazettes(

ids = []
for theme_query in theme["queries"]:
for excerpt in get_excerpts_from_gazettes_with_themed_query(
theme_query, gazette_ids, index
):
# excerpts with less than 10% of the expected size of excerpt account for
# fewer than 1% of excerpts yet their score is usually high
if len(excerpt["excerpt"]) < 200:
continue

index.index_document(
excerpt,
document_id=excerpt["excerpt_id"],
index=theme["index"],
refresh=True,
)
ids.append(excerpt["excerpt_id"])
for batch in batched(gazette_ids, 500):
for excerpt in get_excerpts_from_gazettes_with_themed_query(
theme_query, batch, index
):
# excerpts with less than 10% of the expected size of excerpt account for
# fewer than 1% of excerpts yet their score is usually high
if len(excerpt["excerpt"]) < 200:
continue

index.index_document(
excerpt,
document_id=excerpt["excerpt_id"],
index=theme["index"],
refresh=True,
)
ids.append(excerpt["excerpt_id"])

return ids

Expand Down Expand Up @@ -152,7 +153,7 @@ def get_es_query_from_themed_query(
) -> Dict:
es_query = {
"query": {"bool": {"must": [], "filter": {"ids": {"values": gazette_ids}}}},
"size": 100,
"size": 10,
"highlight": {
"fields": {
"source_text.with_stopwords": {
Expand Down
1 change: 1 addition & 0 deletions tasks/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
get_documents_from_query_with_highlights,
get_documents_with_ids,
)
from .iter import batched
from .text import clean_extra_whitespaces
11 changes: 11 additions & 0 deletions tasks/utils/iter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from itertools import islice


def batched(iterable, n):
    """Split *iterable* into consecutive tuples of at most *n* items.

    Example: ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``.

    Arguments are validated when the function is called (not on first
    iteration), which matches ``itertools.batched``; the batches themselves
    are still produced lazily.

    Can be removed in favor of ``itertools.batched`` once the project runs
    on Python 3.12 or newer.

    Args:
        iterable: any iterable; it is consumed lazily, one batch at a time.
        n: maximum number of items per batch; must be at least 1.

    Returns:
        An iterator of tuples. Every tuple has ``n`` items except possibly
        the last one, which holds the remaining items. An empty input
        produces no batches.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    # Two-argument iter() keeps calling the lambda until it returns the
    # sentinel; islice yields an empty tuple only once `it` is exhausted.
    return iter(lambda: tuple(islice(it, n)), ())

0 comments on commit 2faf34f

Please sign in to comment.