From 6257fcb2e2f98ccd916e497a324a02d4e6aaaf8a Mon Sep 17 00:00:00 2001
From: nlouwen <nicolouwen@hotmail.com>
Date: Wed, 18 Dec 2024 11:23:38 +0100
Subject: [PATCH 01/15] config hash only relevant values

---
 big_scape/cli/config.py  | 27 ++++++++++++++++++++++++++-
 big_scape/data/sqlite.py |  4 ++--
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/big_scape/cli/config.py b/big_scape/cli/config.py
index d9ac2cb1..80d612f9 100644
--- a/big_scape/cli/config.py
+++ b/big_scape/cli/config.py
@@ -165,7 +165,6 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> Non
         """
         with open(config_file_path, "rb") as f:
             content = f.read()
-            BigscapeConfig.HASH = hashlib.sha256(content).hexdigest()
             config = yaml.load(content, Loader=yaml.FullLoader)
 
         # PROFILER
@@ -212,10 +211,36 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> Non
                 legacy_classes[group] = set(classes)
         BigscapeConfig.LEGACY_ANTISMASH_CLASSES = legacy_classes
 
+        # store relevant hash
+        BigscapeConfig.generate_relevant_hash()
+
         # write config log
         if log_path is not None:
             BigscapeConfig.write_config_log(log_path, config)
 
+    @staticmethod
+    def generate_relevant_hash() -> None:
+        """Generates a config hash from values that might/will invalidate existing data"""
+        content = (
+            BigscapeConfig.MERGED_CAND_CLUSTER_TYPE,
+            BigscapeConfig.CDS_OVERLAP_CUTOFF,
+            BigscapeConfig.DOMAIN_OVERLAP_CUTOFF,
+            BigscapeConfig.REGION_MIN_LCS_LEN,
+            BigscapeConfig.PROTO_MIN_LCS_LEN,
+            BigscapeConfig.REGION_MIN_EXTEND_LEN,
+            BigscapeConfig.REGION_MIN_EXTEND_LEN_BIO,
+            BigscapeConfig.PROTO_MIN_EXTEND_LEN,
+            BigscapeConfig.NO_MIN_CLASSES,
+            BigscapeConfig.EXTEND_MATCH_SCORE,
+            BigscapeConfig.EXTEND_MISMATCH_SCORE,
+            BigscapeConfig.EXTEND_GAP_SCORE,
+            BigscapeConfig.EXTEND_MAX_MATCH_PERC,
+            BigscapeConfig.ANCHOR_DOMAINS,
+        )
+        BigscapeConfig.HASH = hashlib.sha256(
+            bytearray(str(content), "utf-8")
+        ).hexdigest()
+
     @staticmethod
     def write_config_log(log_path: Path, config: dict) -> None:
         """writes config log file
diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py
index 2935c0a2..5c549278 100644
--- a/big_scape/data/sqlite.py
+++ b/big_scape/data/sqlite.py
@@ -481,8 +481,8 @@ def check_config_hash():
 
         if latest_config and BigscapeConfig.HASH != latest_config:
             raise RuntimeError(
-                "Config file values have changed from the previous run! "
-                "Existing data is not guarenteed to be reusable, please "
+                "Relevant config file values have changed (see config.log) from the "
+                "previous run! Existing data is not guarenteed to be reusable, please "
                 "run with a fresh output directory/database."
             )
 

From 5076608c7500be0dfec36c436876acaf0c5cce71 Mon Sep 17 00:00:00 2001
From: nlouwen <nicolouwen@hotmail.com>
Date: Wed, 18 Dec 2024 13:40:58 +0100
Subject: [PATCH 02/15] swap from toml to tomllib

---
 big_scape/utility/version.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/big_scape/utility/version.py b/big_scape/utility/version.py
index df85de3a..daefdd02 100644
--- a/big_scape/utility/version.py
+++ b/big_scape/utility/version.py
@@ -1,8 +1,7 @@
 """Module that contains helper functions specifically related to the bigscape version
 """
 
-import toml
-
+import tomllib
 from importlib import metadata
 from pathlib import Path
 
@@ -21,7 +20,8 @@ def get_bigscape_version() -> str:
     pyproject_toml = Path(__file__).parent.parent.parent / "pyproject.toml"
 
     if pyproject_toml.exists():
-        return toml.load(pyproject_toml)["project"]["version"]
+        with open(pyproject_toml, "rb") as fp:
+            return tomllib.load(fp)["project"]["version"]
 
     # if not, we're probably running as a package. get the version of the currently
     # installed big-scape package

From 56d092d035824c383876c25bd0369de25a607bfd Mon Sep 17 00:00:00 2001
From: nlouwen <nicolouwen@hotmail.com>
Date: Wed, 18 Dec 2024 13:41:54 +0100
Subject: [PATCH 03/15] change to lowercase folder name

---
 ...SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {Figures => figures}/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png (100%)

diff --git a/Figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png b/figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png
similarity index 100%
rename from Figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png
rename to figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png

From 848f7479dc2cc263be9a07b7e7642888b9c54766 Mon Sep 17 00:00:00 2001
From: nlouwen <nicolouwen@hotmail.com>
Date: Wed, 18 Dec 2024 13:52:43 +0100
Subject: [PATCH 04/15] add missing test __init__.py

---
 test/comparison/__init__.py | 1 +
 test/hmm/__init__.py        | 1 +
 test/network/__init__.py    | 1 +
 3 files changed, 3 insertions(+)
 create mode 100644 test/comparison/__init__.py
 create mode 100644 test/hmm/__init__.py
 create mode 100644 test/network/__init__.py

diff --git a/test/comparison/__init__.py b/test/comparison/__init__.py
new file mode 100644
index 00000000..03dc8e43
--- /dev/null
+++ b/test/comparison/__init__.py
@@ -0,0 +1 @@
+"""Contains tests for distance calculation"""
diff --git a/test/hmm/__init__.py b/test/hmm/__init__.py
new file mode 100644
index 00000000..c6dfb8cc
--- /dev/null
+++ b/test/hmm/__init__.py
@@ -0,0 +1 @@
+"""Contains tests involving (py)HMMer analysis and processing"""
diff --git a/test/network/__init__.py b/test/network/__init__.py
new file mode 100644
index 00000000..8f435c38
--- /dev/null
+++ b/test/network/__init__.py
@@ -0,0 +1 @@
+"""Contains tests involving CC and family generation"""

From cd3b2f21b1f7c81afaca5f7834ec9c6b80628a2e Mon Sep 17 00:00:00 2001
From: nlouwen <nicolouwen@hotmail.com>
Date: Wed, 18 Dec 2024 14:01:46 +0100
Subject: [PATCH 05/15] remove unused networkx dependency

---
 big_scape/network/families.py | 61 -----------------------------------
 big_scape/network/utility.py  | 17 ----------
 environment.yml               |  1 -
 pyproject.toml                |  1 -
 4 files changed, 80 deletions(-)

diff --git a/big_scape/network/families.py b/big_scape/network/families.py
index 10a54ba4..3f3bd15e 100644
--- a/big_scape/network/families.py
+++ b/big_scape/network/families.py
@@ -4,9 +4,6 @@
 import sys
 from typing import Callable, Optional
 import warnings
-import numpy as np
-import networkx
-import math
 import logging
 
 # from dependencies
@@ -105,24 +102,6 @@ def generate_families(
     return regions_families
 
 
-def get_cc_edge_weight_std(connected_component) -> float:
-    """calculates the standard deviation of the edge weights of a connected component
-
-    Args:
-        connected_component (list[tuple[int, int, float, float, float, float, str]]):
-            connected component in the form of a list of edges
-
-    Returns:
-        float: standard deviation of the edge weights of the connected component
-    """
-
-    edge_weights = [edge[2] for edge in connected_component]
-    edge_std = np.std(edge_weights)
-    edge_std = round(edge_std, 2)
-
-    return edge_std
-
-
 def get_cc_density(
     connected_component: list[tuple[int, int, float, float, float, float, int]]
 ) -> float:
@@ -148,46 +127,6 @@ def get_cc_density(
     return cc_density
 
 
-def test_centrality(connected_component, node_fraction) -> tuple[bool, list[int]]:
-    """tests if a network will break when removing the top nodes
-    with highest betweenness centrality
-
-    Args:
-        connected_component (list[tuple[int, int, float, float, float, float, str]]):
-            connected component in the form of a list of edges
-        node_fraction (float): fraction of nodes with highest betweenness centrality to remove
-
-    Returns:
-        tuple[bool, list[int]]: whether the network breaks and the list of nodes sorted by betweenness centrality
-    """
-
-    edgelist = [(edge[0], edge[1], edge[2]) for edge in connected_component]
-
-    graph = networkx.Graph()
-    graph.add_weighted_edges_from(edgelist)
-
-    betweeness_centrality_dict = networkx.betweenness_centrality(graph)
-    sorted_between_bentrality_nodes = sorted(
-        betweeness_centrality_dict, key=betweeness_centrality_dict.get, reverse=True
-    )
-
-    # round up to nearest integer
-    top_nodes = math.ceil(len(sorted_between_bentrality_nodes) * node_fraction)
-    nodes_to_remove = sorted_between_bentrality_nodes[:top_nodes]
-
-    for node in nodes_to_remove:
-        graph.remove_node(node)
-
-    nr_ccs = networkx.number_connected_components(graph)
-
-    del graph
-
-    if nr_ccs > 1:
-        return True, sorted_between_bentrality_nodes
-
-    return False, sorted_between_bentrality_nodes
-
-
 def aff_sim_matrix(matrix, preference: Optional[float] = None):
     """Execute affinity propagation on a __similarity__ matrix
 
diff --git a/big_scape/network/utility.py b/big_scape/network/utility.py
index ab3a9dfd..284c544d 100644
--- a/big_scape/network/utility.py
+++ b/big_scape/network/utility.py
@@ -2,23 +2,6 @@
 
 # from dependencies
 import numpy as np
-import networkx as nx
-
-
-def sim_matrix_from_graph(graph: nx.Graph, edge_property: str) -> np.ndarray:
-    """Return a similarity matrix from a graph in the form of a numpy array
-
-    Args:
-        graph (Graph): graph
-        edge_property (str): _description_
-
-    Returns:
-        ndarray: _description_
-    """
-    matrix = nx.to_numpy_array(graph, weight=edge_property, nonedge=1.0)
-    # have to convert from distances to similarity
-    matrix = 1 - matrix
-    return matrix
 
 
 def edge_list_to_adj_list(
diff --git a/environment.yml b/environment.yml
index 1a86722f..e668390c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,7 +7,6 @@ dependencies:
   - biopython=1.81
   - sortedcontainers=2.4.0
   - fasttree=2.1.11
-  - networkx=3.1
   - numpy=1.26.0
   - pandas=2.1.1
   - pyhmmer=0.10.14
diff --git a/pyproject.toml b/pyproject.toml
index a68e0a86..2a3e1506 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,6 @@ dev = [
 
     # type stubs (https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports)
     "types-psutil",
-    "networkx-stubs",
     "data-science-types",
     "types-tqdm",
     "types-setuptools"

From a6010774c1cf510892c95d6c46f2f58f5a4a1775 Mon Sep 17 00:00:00 2001
From: nlouwen <nicolouwen@hotmail.com>
Date: Wed, 18 Dec 2024 14:08:21 +0100
Subject: [PATCH 06/15] fix tests

---
 test/network/test_family.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/test/network/test_family.py b/test/network/test_family.py
index 95e904d2..07c0bcbe 100644
--- a/test/network/test_family.py
+++ b/test/network/test_family.py
@@ -121,18 +121,6 @@ def test_aff_sim_matrix(self):
 
         self.assertListEqual(expected_labels, actual_labels)
 
-    def test_get_cc_edge_weight_std(self):
-        """Tests whether the standard deviation of the edge weights of a connected
-        component is correctly calculated
-        """
-        adj_list = TestAffinityPropagation.gen_edge_list()
-
-        expected_std = 0.12
-
-        actual_std = bs_families.get_cc_edge_weight_std(adj_list)
-
-        self.assertEqual(expected_std, actual_std)
-
     def test_get_cc_density(self):
         """Tests whether the density of a connected component is correctly
         calculated
@@ -146,17 +134,3 @@ def test_get_cc_density(self):
         actual_density = bs_families.get_cc_density(adj_list)
 
         self.assertEqual(expected_density, actual_density)
-
-    def test_test_centrality(self):
-        """Tests whether the test_centrality function correctly identifies a network
-        that will break when removing the top nodes with highest betweenness centrality
-        """
-        adj_list = TestAffinityPropagation.gen_edge_list_alt()
-
-        expected_break = True
-
-        actual_break, actual_sorted_centrality_nodes = bs_families.test_centrality(
-            adj_list, 0.3
-        )
-
-        self.assertEqual(expected_break, actual_break)

From a8ef76f5c2a4752c33f7703048e55e38cfa56bb0 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Thu, 19 Dec 2024 09:14:17 +0100
Subject: [PATCH 07/15] split tests from badges yml

---
 .github/workflows/badges.yml    | 28 -------------------------
 .github/workflows/run-tests.yml | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 28 deletions(-)
 create mode 100644 .github/workflows/run-tests.yml

diff --git a/.github/workflows/badges.yml b/.github/workflows/badges.yml
index 6bcfed64..ee2e3be0 100644
--- a/.github/workflows/badges.yml
+++ b/.github/workflows/badges.yml
@@ -5,34 +5,6 @@ on:
     branches:
       - master
 
-jobs:
-  run_pytest:
-    runs-on: ubuntu-latest
-    if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation')  }}
-    timeout-minutes: 15
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v3
-
-    - name: Set up mamba environment
-      uses: mamba-org/setup-micromamba@v1
-      with:
-        micromamba-version: '1.3.1-0'
-        environment-file: environment.yml
-        environment-name: BiG-SCAPE
-        init-shell: bash
-        generate-run-shell: true
-
-    - name: Install dependencies
-      shell: micromamba-shell {0}
-      run: |
-        python -m pip install pytest
-
-    - name: Test with Pytest
-      shell: micromamba-shell {0}
-      run: |
-        pytest
-
   generate_coverage:
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation')  }}
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
new file mode 100644
index 00000000..589e11b9
--- /dev/null
+++ b/.github/workflows/run-tests.yml
@@ -0,0 +1,36 @@
+name: Run tests
+
+on:
+  push:
+    branches:
+      - master
+      - dev
+      - release/*
+
+jobs:
+  run_pytest:
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation')  }}
+    timeout-minutes: 15
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Set up mamba environment
+      uses: mamba-org/setup-micromamba@v1
+      with:
+        micromamba-version: '1.3.1-0'
+        environment-file: environment.yml
+        environment-name: BiG-SCAPE
+        init-shell: bash
+        generate-run-shell: true
+
+    - name: Install dependencies
+      shell: micromamba-shell {0}
+      run: |
+        python -m pip install pytest
+
+    - name: Test with Pytest
+      shell: micromamba-shell {0}
+      run: |
+        pytest

From b05cf989a23758f8d954ba51b5e810cd20146307 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Thu, 19 Dec 2024 09:17:57 +0100
Subject: [PATCH 08/15] add more branch types

---
 .github/workflows/run-tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 589e11b9..589e63fc 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -6,6 +6,8 @@ on:
       - master
       - dev
       - release/*
+      - feature/*
+      - hotfix/*
 
 jobs:
   run_pytest:

From 95714b085ce788a033401a746fcd7f49ad136819 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Fri, 20 Dec 2024 11:03:37 +0100
Subject: [PATCH 09/15] fix pytest badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 621749eb..7fac44d8 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 ![License](https://img.shields.io/github/license/medema-group/BiG-SCAPE)
 ![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29)
 ![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads)
-![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/test.yml/badge.svg)
+![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg)
 ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg)
 ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg)
 

From 3d5cad0dba4f16d556d1f7ffc9bb5eddfd30f029 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Fri, 20 Dec 2024 11:03:45 +0100
Subject: [PATCH 10/15] add docker image upload badge

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7fac44d8..633172e3 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 ![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29)
 ![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads)
 ![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg)
+![Docker deploy workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/deploy-docker.yml/badge.svg)
 ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg)
 ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg)
 

From 032094fbf4bd72c7eaa27ff2317837b5e62cf2c0 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Tue, 15 Oct 2024 16:36:27 +0200
Subject: [PATCH 11/15] implement temp table for hashes

---
 big_scape/genbank/gbk.py | 80 ++++++++++++++++++++++++++++++++++------
 1 file changed, 69 insertions(+), 11 deletions(-)

diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py
index 3b1e01ee..8c6c764a 100644
--- a/big_scape/genbank/gbk.py
+++ b/big_scape/genbank/gbk.py
@@ -6,6 +6,8 @@
 
 # from enum import Enum
 from pathlib import Path
+import random
+import string
 from typing import Dict, Optional
 import hashlib
 
@@ -14,6 +16,7 @@
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Column, ForeignKey, Integer, String, Table, select
 
 # from other modules
 from big_scape.errors import InvalidGBKError
@@ -34,6 +37,57 @@
 #     MIBIG = "mibig"
 #     REFERENCE = "reference"
 
+# TODO: generalize creating temp tables. this is copied from network.py
+
+
+def create_temp_hash_table(hashes: list[str]) -> Table:
+    """Create a temporary table with ids of given records
+
+    Args:
+        hashes (list[str]): the gbk hashes to insert into the temporary table
+
+    Returns:
+        Table: the temporary table
+    """
+
+    # generate a short random string
+    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))
+
+    temp_table = Table(
+        temp_table_name,
+        DB.metadata,
+        Column(
+            "hash",
+            String,
+            ForeignKey(DB.metadata.tables["gbk"].c.hash),
+            primary_key=True,
+            nullable=False,
+        ),
+        prefixes=["TEMPORARY"],
+    )
+
+    DB.metadata.create_all(DB.engine)
+
+    if DB.engine is None:
+        raise RuntimeError("DB engine is None")
+
+    cursor = DB.engine.raw_connection().driver_connection.cursor()
+
+    insert_query = f"""
+        INSERT INTO {temp_table_name} (hash) VALUES (?);
+    """
+
+    cursor.executemany(insert_query, [(x,) for x in hashes])  # type: ignore
+
+    cursor.close()
+
+    DB.commit()
+
+    if DB.metadata is None:
+        raise ValueError("DB metadata is None")
+
+    return temp_table
+
 
 class GBK:
     """
@@ -261,7 +315,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
+        def iter_hashes():
+            for gbk in input_gbks:
+                yield gbk.hash
+
+        temp_hash_table = create_temp_hash_table(iter_hashes())
 
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")
@@ -278,7 +336,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
                 gbk_table.c.taxonomy,
                 gbk_table.c.description,
             )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.hash.in_(select(temp_hash_table.c.hash)))
             .compile()
         )
 
@@ -635,15 +693,15 @@ def collapse_hybrids_in_cand_clusters(
                     for number in cand_cluster.proto_clusters.keys()
                 ]
                 merged_protocluster = MergedProtoCluster.merge(protoclusters)
-                merged_tmp_proto_clusters[
-                    merged_protocluster.number
-                ] = merged_protocluster
+                merged_tmp_proto_clusters[merged_protocluster.number] = (
+                    merged_protocluster
+                )
 
                 # update the protocluster old:new ids for the merged protoclusters of this cand_cluster
                 for proto_cluster_num in cand_cluster.proto_clusters.keys():
-                    merged_protocluster_ids[
-                        proto_cluster_num
-                    ] = merged_protocluster.number
+                    merged_protocluster_ids[proto_cluster_num] = (
+                        merged_protocluster.number
+                    )
 
         # now we build a new version of the tmp_proto_clusters dict that contains the merged protoclusters
         # as well as protoclusters which did not need merging, with updated unique IDs/numbers
@@ -657,9 +715,9 @@ def collapse_hybrids_in_cand_clusters(
                     # this protocluster has been merged, so we need to add it to
                     # the dict with its new protocluster number
                     new_proto_cluster_num = merged_protocluster_ids[proto_cluster_num]
-                    updated_tmp_proto_clusters[
-                        new_proto_cluster_num
-                    ] = merged_tmp_proto_clusters[new_proto_cluster_num]
+                    updated_tmp_proto_clusters[new_proto_cluster_num] = (
+                        merged_tmp_proto_clusters[new_proto_cluster_num]
+                    )
                     updated_proto_cluster_dict[new_proto_cluster_num] = None
                 else:
                     # protoclusters which have not been merged are added to the dict as is

From 441b668ff8eecf762f6edf96d2762b4c475b69d7 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Wed, 16 Oct 2024 10:40:48 +0200
Subject: [PATCH 12/15] batch hash insertions

---
 big_scape/genbank/gbk.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py
index 8c6c764a..2febd7e3 100644
--- a/big_scape/genbank/gbk.py
+++ b/big_scape/genbank/gbk.py
@@ -40,7 +40,7 @@
 # TODO: generalize creating temp tables. this is copied from network.py
 
 
-def create_temp_hash_table(hashes: list[str]) -> Table:
+def create_temp_hash_table(gbks: list[GBK]) -> Table:
     """Create a temporary table with ids of given records
 
     Args:
@@ -77,7 +77,13 @@ def create_temp_hash_table(hashes: list[str]) -> Table:
         INSERT INTO {temp_table_name} (hash) VALUES (?);
     """
 
-    cursor.executemany(insert_query, [(x,) for x in hashes])  # type: ignore
+    def batch_hash(gbks: list[GBK], n: int):
+        l = len(gbks)
+        for ndx in range(0, l, n):
+            yield [gbk.hash for gbk in gbks[ndx : min(ndx + n, l)]]
+
+    for hash_batch in batch_hash(gbks, 1000):
+        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore
 
     cursor.close()
 
@@ -315,11 +321,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        def iter_hashes():
-            for gbk in input_gbks:
-                yield gbk.hash
-
-        temp_hash_table = create_temp_hash_table(iter_hashes())
+        temp_hash_table = create_temp_hash_table(input_gbks)
 
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")

From 15f1c9402959ff6df316e74c0c16d5119e534ea1 Mon Sep 17 00:00:00 2001
From: Arjan Draisma <arjan.draisma@wur.nl>
Date: Fri, 20 Dec 2024 15:32:31 +0100
Subject: [PATCH 13/15] use a temp table for gbk ids in record loading

---
 big_scape/genbank/candidate_cluster.py | 14 +++--
 big_scape/genbank/cds.py               | 12 +++-
 big_scape/genbank/gbk.py               | 79 ++++++++++++++++++++++----
 big_scape/genbank/proto_cluster.py     | 15 ++++-
 big_scape/genbank/proto_core.py        | 14 ++++-
 big_scape/genbank/region.py            | 14 +++--
 6 files changed, 120 insertions(+), 28 deletions(-)

diff --git a/big_scape/genbank/candidate_cluster.py b/big_scape/genbank/candidate_cluster.py
index 1e0c64cb..e1626023 100644
--- a/big_scape/genbank/candidate_cluster.py
+++ b/big_scape/genbank/candidate_cluster.py
@@ -7,6 +7,7 @@
 
 # from dependencies
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.data import DB
@@ -169,7 +170,7 @@ def __repr__(self) -> str:
         return f"{self.parent_gbk} Candidate cluster {self.number} {self.nt_start}-{self.nt_stop} "
 
     @staticmethod
-    def load_all(region_dict: dict[int, Region]):
+    def load_all(region_dict: dict[int, Region], temp_gbk_id_table: Table = None):
         """Load all CandidateCluster objects from the database
 
         This function populates the CandidateCluster lists in the Regions provided in
@@ -198,10 +199,15 @@ def load_all(region_dict: dict[int, Region]):
                 record_table.c.product,
             )
             .where(record_table.c.record_type == "cand_cluster")
-            .where(record_table.c.parent_id.in_(region_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            candidate_cluster_select_query = candidate_cluster_select_query.where(
+                record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        candidate_cluster_select_query = candidate_cluster_select_query.compile()
+
         cursor_result = DB.execute(candidate_cluster_select_query)
 
         candidate_cluster_dict = {}
@@ -230,4 +236,4 @@ def load_all(region_dict: dict[int, Region]):
             # add to dictionary
             candidate_cluster_dict[result.id] = new_candidate_cluster
 
-        ProtoCluster.load_all(candidate_cluster_dict)
+        ProtoCluster.load_all(candidate_cluster_dict, temp_gbk_id_table)
diff --git a/big_scape/genbank/cds.py b/big_scape/genbank/cds.py
index 752845c6..e91aeae1 100644
--- a/big_scape/genbank/cds.py
+++ b/big_scape/genbank/cds.py
@@ -10,6 +10,7 @@
 from Bio.SeqFeature import SeqFeature
 from Bio.Seq import Seq
 from Bio import BiopythonWarning
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.errors import InvalidGBKError
@@ -320,7 +321,7 @@ def len_nt_overlap(cds_a: CDS, cds_b: CDS) -> int:
         return max(0, right - left)
 
     @staticmethod
-    def load_all(gbk_dict: dict[int, GBK]) -> None:
+    def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None:
         """Load all Region objects from the database
 
         This function populates the region objects in the GBKs provided in the input
@@ -349,10 +350,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
                 cds_table.c.aa_seq,
             )
             .order_by(cds_table.c.orf_num)
-            .where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            region_select_query = region_select_query.where(
+                cds_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        region_select_query = region_select_query.compile()
+
         cursor_result = DB.execute(region_select_query)
 
         for result in cursor_result.all():
diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py
index 2febd7e3..a6844fc3 100644
--- a/big_scape/genbank/gbk.py
+++ b/big_scape/genbank/gbk.py
@@ -95,6 +95,61 @@ def batch_hash(gbks: list[GBK], n: int):
     return temp_table
 
 
+def create_temp_gbk_id_table(gbks: list[GBK]) -> Table:
+    """Create a temporary table with ids of given gbks
+
+    Args:
+        gbks (list[GBK]): the gbks whose database ids fill the temporary table
+
+    Returns:
+        Table: the temporary table
+    """
+
+    # generate a short random string
+    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))
+
+    temp_table = Table(
+        temp_table_name,
+        DB.metadata,
+        Column(
+            "gbk_id",
+            Integer,
+            ForeignKey(DB.metadata.tables["gbk"].c.id),
+            primary_key=True,
+            nullable=False,
+        ),
+        prefixes=["TEMPORARY"],
+    )
+
+    DB.metadata.create_all(DB.engine)
+
+    if DB.engine is None:
+        raise RuntimeError("DB engine is None")
+
+    cursor = DB.engine.raw_connection().driver_connection.cursor()
+
+    insert_query = f"""
+        INSERT INTO {temp_table_name} (gbk_id) VALUES (?);
+    """
+
+    def batch_hash(gbks: list[GBK], n: int):
+        l = len(gbks)
+        for ndx in range(0, l, n):
+            yield [gbk._db_id for gbk in gbks[ndx : min(ndx + n, l)]]
+
+    for hash_batch in batch_hash(gbks, 1000):
+        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore
+
+    cursor.close()
+
+    DB.commit()
+
+    if DB.metadata is None:
+        raise ValueError("DB metadata is None")
+
+    return temp_table
+
+
 class GBK:
     """
     Class to describe a given GBK file
@@ -357,9 +412,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
         # load GBK regions. This will also populate all record levels below region
         # e.g. candidate cluster, protocore if they exist
 
-        Region.load_all(gbk_dict)
+        temp_gbk_id_table = create_temp_gbk_id_table(input_gbks)
 
-        CDS.load_all(gbk_dict)
+        Region.load_all(gbk_dict, temp_gbk_id_table)
+
+        CDS.load_all(gbk_dict, temp_gbk_id_table)
 
         return list(gbk_dict.values())
 
@@ -695,15 +752,15 @@ def collapse_hybrids_in_cand_clusters(
                     for number in cand_cluster.proto_clusters.keys()
                 ]
                 merged_protocluster = MergedProtoCluster.merge(protoclusters)
-                merged_tmp_proto_clusters[merged_protocluster.number] = (
-                    merged_protocluster
-                )
+                merged_tmp_proto_clusters[
+                    merged_protocluster.number
+                ] = merged_protocluster
 
                 # update the protocluster old:new ids for the merged protoclusters of this cand_cluster
                 for proto_cluster_num in cand_cluster.proto_clusters.keys():
-                    merged_protocluster_ids[proto_cluster_num] = (
-                        merged_protocluster.number
-                    )
+                    merged_protocluster_ids[
+                        proto_cluster_num
+                    ] = merged_protocluster.number
 
         # now we build a new version of the tmp_proto_clusters dict that contains the merged protoclusters
         # as well as protoclusters which did not need merging, with updated unique IDs/numbers
@@ -717,9 +774,9 @@ def collapse_hybrids_in_cand_clusters(
                     # this protocluster has been merged, so we need to add it to
                     # the dict with its new protocluster number
                     new_proto_cluster_num = merged_protocluster_ids[proto_cluster_num]
-                    updated_tmp_proto_clusters[new_proto_cluster_num] = (
-                        merged_tmp_proto_clusters[new_proto_cluster_num]
-                    )
+                    updated_tmp_proto_clusters[
+                        new_proto_cluster_num
+                    ] = merged_tmp_proto_clusters[new_proto_cluster_num]
                     updated_proto_cluster_dict[new_proto_cluster_num] = None
                 else:
                     # protoclusters which have not been merged are added to the dict as is
diff --git a/big_scape/genbank/proto_cluster.py b/big_scape/genbank/proto_cluster.py
index 8a1dbf6e..53c6cdfd 100644
--- a/big_scape/genbank/proto_cluster.py
+++ b/big_scape/genbank/proto_cluster.py
@@ -7,6 +7,7 @@
 
 # from dependencies
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.data import DB
@@ -179,7 +180,10 @@ def __repr__(self) -> str:
         return f"{self.parent_gbk} ProtoCluster {self.number} {self.nt_start}-{self.nt_stop} "
 
     @staticmethod
-    def load_all(candidate_cluster_dict: dict[int, CandidateCluster]):
+    def load_all(
+        candidate_cluster_dict: dict[int, CandidateCluster],
+        temp_gbk_id_table: Table = None,
+    ):
         """Load all ProtoCluster objects from the database
 
         This function populates the CandidateCluster objects in the GBKs provided in the
@@ -210,10 +214,15 @@ def load_all(candidate_cluster_dict: dict[int, CandidateCluster]):
                 record_table.c.merged,
             )
             .where(record_table.c.record_type == "protocluster")
-            .where(record_table.c.parent_id.in_(candidate_cluster_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            protocluster_select_query = protocluster_select_query.where(
+                record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        protocluster_select_query = protocluster_select_query.compile()
+
         cursor_result = DB.execute(protocluster_select_query)
 
         protocluster_dict = {}
diff --git a/big_scape/genbank/proto_core.py b/big_scape/genbank/proto_core.py
index b5cc6481..090d9b2c 100644
--- a/big_scape/genbank/proto_core.py
+++ b/big_scape/genbank/proto_core.py
@@ -7,6 +7,7 @@
 
 # from dependencies
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.data import DB
@@ -110,7 +111,9 @@ def __repr__(self) -> str:
         )
 
     @staticmethod
-    def load_all(protocluster_dict: dict[int, ProtoCluster]):
+    def load_all(
+        protocluster_dict: dict[int, ProtoCluster], temp_gbk_id_table: Table = None
+    ):
         """Load all ProtoCore objects from the database
 
         This function populates the region objects in the GBKs provided in the input
@@ -141,10 +144,15 @@ def load_all(protocluster_dict: dict[int, ProtoCluster]):
                 record_table.c.merged,
             )
             .where(record_table.c.record_type == "proto_core")
-            .where(record_table.c.parent_id.in_(protocluster_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            region_select_query = region_select_query.where(
+                record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        region_select_query = region_select_query.compile()
+
         cursor_result = DB.execute(region_select_query)
 
         for result in cursor_result.all():
diff --git a/big_scape/genbank/region.py b/big_scape/genbank/region.py
index 7b7c6d45..3cd85ba7 100644
--- a/big_scape/genbank/region.py
+++ b/big_scape/genbank/region.py
@@ -8,6 +8,7 @@
 # from dependencies
 from Bio.SeqFeature import SeqFeature
 from Bio.SeqRecord import SeqRecord
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.data import DB
@@ -262,7 +263,7 @@ def __repr__(self):
         return f"{self.parent_gbk} Region {self.number} {self.nt_start}-{self.nt_stop} "
 
     @staticmethod
-    def load_all(gbk_dict: dict[int, GBK]) -> None:
+    def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None:
         """Load all Region objects from the database
 
         This function populates the region objects in the GBKs provided in the input
@@ -292,10 +293,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
                 record_table.c.product,
             )
             .where(record_table.c.record_type == "region")
-            .where(record_table.c.gbk_id.in_(gbk_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            region_select_query = region_select_query.where(
+                record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        region_select_query = region_select_query.compile()
+
         cursor_result = DB.execute(region_select_query)
 
         region_dict = {}
@@ -320,4 +326,4 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             # add to dictionary
             region_dict[result.id] = new_region
 
-        CandidateCluster.load_all(region_dict)
+        CandidateCluster.load_all(region_dict, temp_gbk_id_table)

From 645c9867445a2ab8d9e9249437d3e25539562f68 Mon Sep 17 00:00:00 2001
From: adraismawur <adraismawur@users.noreply.github.com>
Date: Mon, 16 Dec 2024 14:58:22 +0000
Subject: [PATCH 14/15] Update version number to 2.0.0-beta.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2a3e1506..b7898b9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name ="big-scape"
-version = "release/2.0.0"
+version = "2.0.0-beta.2"
 description = "Biosynthetic Gene Similarity Clustering and Prospecting Engine"
 requires-python = ">=3.11"
 license = { file = "LICENSE" }

From 1912a4cfefef82a0d86e6a7c90ed4d6f1e204444 Mon Sep 17 00:00:00 2001
From: adraismawur <adraismawur@users.noreply.github.com>
Date: Fri, 20 Dec 2024 17:25:31 +0000
Subject: [PATCH 15/15] Update version number to 2.0.0-beta.3

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b7898b9c..baa2ff18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name ="big-scape"
-version = "2.0.0-beta.2"
+version = "2.0.0-beta.3"
 description = "Biosynthetic Gene Similarity Clustering and Prospecting Engine"
 requires-python = ">=3.11"
 license = { file = "LICENSE" }