Merge pull request #182 from opensanctions/pudo/perf-stmt

Try to optimize statement output performance
opensanctions · Nov 19, 2024 · 2a55e85
2 parents 1bc494c + 1c25aa1
commit 2a55e85
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 11 deletions.
diff --git a/nomenklatura/statement/serialize.py b/nomenklatura/statement/serialize.py
@@ -9,7 +9,7 @@
 from followthemoney.cli.util import MAX_LINE
 
 from nomenklatura.statement.statement import S
-from nomenklatura.util import pack_prop, unpack_prop
+from nomenklatura.util import unpack_prop
 
 JSON = "json"
 CSV = "csv"
@@ -167,7 +167,7 @@ def __init__(self, fh: TextIO) -> None:
 
     def write(self, stmt: S) -> None:
         row = stmt.to_csv_row()
-        self._batch.append([row.get(c) for c in CSV_COLUMNS])
+        self._batch.append([row[c] for c in CSV_COLUMNS])
         if len(self._batch) >= CSV_BATCH:
             self.writer.writerows(self._batch)
             self._batch.clear()
@@ -189,13 +189,27 @@ def __init__(self, fh: TextIO) -> None:
         self._batch: List[List[Optional[str]]] = []
 
     def write(self, stmt: S) -> None:
-        row = stmt.to_csv_row()
-        prop = row.pop("prop")
-        schema = row.pop("schema")
-        if prop is None or schema is None:
-            raise ValueError("Cannot pack statement without prop and schema")
-        row["prop"] = pack_prop(schema, prop)
-        self._batch.append([row.get(c) for c in PACK_COLUMNS])
+        # HACK: This is very similar to the CSV writer, but at the very inner
+        # loop of the application, so we're duplicating code here.
+        target_value: Optional[str] = "t" if stmt.target else "f"
+        if stmt.target is None:
+            target_value = None
+        external_value: Optional[str] = "t" if stmt.external else "f"
+        if stmt.external is None:
+            external_value = None
+        row = [
+            stmt.entity_id,
+            f"{stmt.schema}:{stmt.prop}",
+            stmt.value,
+            stmt.dataset,
+            stmt.lang,
+            stmt.original_value,
+            target_value,
+            external_value,
+            stmt.first_seen,
+            stmt.last_seen,
+        ]
+        self._batch.append(row)
         if len(self._batch) >= CSV_BATCH:
             self.writer.writerows(self._batch)
             self._batch.clear()

diff --git a/nomenklatura/xref.py b/nomenklatura/xref.py
@@ -35,6 +35,7 @@ def xref(
     limit_factor: int = 10,
     scored: bool = True,
     external: bool = True,
+    discount_internal: float = 0.7,
     range: Optional[Schema] = None,
     auto_threshold: Optional[float] = None,
     conflicting_match_threshold: Optional[float] = None,
@@ -85,9 +86,8 @@ def xref(
             if conflict_reporter is not None:
                 conflict_reporter.check_match(result.score, left_id.id, right_id.id)
 
-            # Not sure this is globally a good idea.
             if len(left.datasets.intersection(right.datasets)) > 0:
-                score = score * 0.7
+                score = score * discount_internal
 
             if auto_threshold is not None and score > auto_threshold:
                 log.info("Auto-merge [%.2f]: %s <> %s", score, left, right)