Skip to content

Commit

Permalink
Merge pull request #182 from opensanctions/pudo/perf-stmt
Browse files: browse the repository at this point in the history
try to optimize statement output perf
  • Loading branch information
pudo authored Nov 19, 2024
2 parents 1bc494c + 1c25aa1 commit 2a55e85
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 11 deletions.
32 changes: 23 additions & 9 deletions nomenklatura/statement/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from followthemoney.cli.util import MAX_LINE

from nomenklatura.statement.statement import S
from nomenklatura.util import pack_prop, unpack_prop
from nomenklatura.util import unpack_prop

JSON = "json"
CSV = "csv"
Expand Down Expand Up @@ -167,7 +167,7 @@ def __init__(self, fh: TextIO) -> None:

def write(self, stmt: S) -> None:
row = stmt.to_csv_row()
self._batch.append([row.get(c) for c in CSV_COLUMNS])
self._batch.append([row[c] for c in CSV_COLUMNS])
if len(self._batch) >= CSV_BATCH:
self.writer.writerows(self._batch)
self._batch.clear()
Expand All @@ -189,13 +189,27 @@ def __init__(self, fh: TextIO) -> None:
self._batch: List[List[Optional[str]]] = []

def write(self, stmt: S) -> None:
row = stmt.to_csv_row()
prop = row.pop("prop")
schema = row.pop("schema")
if prop is None or schema is None:
raise ValueError("Cannot pack statement without prop and schema")
row["prop"] = pack_prop(schema, prop)
self._batch.append([row.get(c) for c in PACK_COLUMNS])
# HACK: This is very similar to the CSV writer, but it sits in the innermost
# loop of the application, so we're duplicating code here.
target_value: Optional[str] = "t" if stmt.target else "f"
if stmt.target is None:
target_value = None
external_value: Optional[str] = "t" if stmt.external else "f"
if stmt.external is None:
external_value = None
row = [
stmt.entity_id,
f"{stmt.schema}:{stmt.prop}",
stmt.value,
stmt.dataset,
stmt.lang,
stmt.original_value,
target_value,
external_value,
stmt.first_seen,
stmt.last_seen,
]
self._batch.append(row)
if len(self._batch) >= CSV_BATCH:
self.writer.writerows(self._batch)
self._batch.clear()
Expand Down
4 changes: 2 additions & 2 deletions nomenklatura/xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def xref(
limit_factor: int = 10,
scored: bool = True,
external: bool = True,
discount_internal: float = 0.7,
range: Optional[Schema] = None,
auto_threshold: Optional[float] = None,
conflicting_match_threshold: Optional[float] = None,
Expand Down Expand Up @@ -85,9 +86,8 @@ def xref(
if conflict_reporter is not None:
conflict_reporter.check_match(result.score, left_id.id, right_id.id)

# Not sure this is globally a good idea.
if len(left.datasets.intersection(right.datasets)) > 0:
score = score * 0.7
score = score * discount_internal

if auto_threshold is not None and score > auto_threshold:
log.info("Auto-merge [%.2f]: %s <> %s", score, left, right)
Expand Down

0 comments on commit 2a55e85

Please sign in to comment.