Merge pull request #241 from medema-group/release/2.0.0-beta.3
Release/2.0.0 beta.3
adraismawur authored Dec 20, 2024
2 parents 94b567f + 0ec0c2b commit 44a4cc7
Showing 21 changed files with 248 additions and 163 deletions.
28 changes: 0 additions & 28 deletions .github/workflows/badges.yml
@@ -5,34 +5,6 @@ on:
     branches:
       - master
 
 jobs:
-  run_pytest:
-    runs-on: ubuntu-latest
-    if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
-    timeout-minutes: 15
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up mamba environment
-        uses: mamba-org/setup-micromamba@v1
-        with:
-          micromamba-version: '1.3.1-0'
-          environment-file: environment.yml
-          environment-name: BiG-SCAPE
-          init-shell: bash
-          generate-run-shell: true
-
-      - name: Install dependencies
-        shell: micromamba-shell {0}
-        run: |
-          python -m pip install pytest
-
-      - name: Test with Pytest
-        shell: micromamba-shell {0}
-        run: |
-          pytest
   generate_coverage:
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
38 changes: 38 additions & 0 deletions .github/workflows/run-tests.yml
@@ -0,0 +1,38 @@
+name: Run tests
+
+on:
+  push:
+    branches:
+      - master
+      - dev
+      - release/*
+      - feature/*
+      - hotfix/*
+
+jobs:
+  run_pytest:
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
+    timeout-minutes: 15
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up mamba environment
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          micromamba-version: '1.3.1-0'
+          environment-file: environment.yml
+          environment-name: BiG-SCAPE
+          init-shell: bash
+          generate-run-shell: true
+
+      - name: Install dependencies
+        shell: micromamba-shell {0}
+        run: |
+          python -m pip install pytest
+
+      - name: Test with Pytest
+        shell: micromamba-shell {0}
+        run: |
+          pytest
3 changes: 2 additions & 1 deletion README.md
@@ -1,7 +1,8 @@
 ![License](https://img.shields.io/github/license/medema-group/BiG-SCAPE)
 ![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29)
 ![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads)
-![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/test.yml/badge.svg)
+![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg)
+![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/deploy-docker.yml/badge.svg)
 ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg)
 ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg)
27 changes: 26 additions & 1 deletion big_scape/cli/config.py
@@ -165,7 +165,6 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> None
         """
         with open(config_file_path, "rb") as f:
             content = f.read()
-            BigscapeConfig.HASH = hashlib.sha256(content).hexdigest()
             config = yaml.load(content, Loader=yaml.FullLoader)
 
         # PROFILER
@@ -212,10 +211,36 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> None
                 legacy_classes[group] = set(classes)
             BigscapeConfig.LEGACY_ANTISMASH_CLASSES = legacy_classes
 
+        # store relevant hash
+        BigscapeConfig.generate_relevant_hash()
+
         # write config log
         if log_path is not None:
             BigscapeConfig.write_config_log(log_path, config)
 
+    @staticmethod
+    def generate_relevant_hash() -> None:
+        """Generates a config hash from values that might/will invalidate existing data"""
+        content = (
+            BigscapeConfig.MERGED_CAND_CLUSTER_TYPE,
+            BigscapeConfig.CDS_OVERLAP_CUTOFF,
+            BigscapeConfig.DOMAIN_OVERLAP_CUTOFF,
+            BigscapeConfig.REGION_MIN_LCS_LEN,
+            BigscapeConfig.PROTO_MIN_LCS_LEN,
+            BigscapeConfig.REGION_MIN_EXTEND_LEN,
+            BigscapeConfig.REGION_MIN_EXTEND_LEN_BIO,
+            BigscapeConfig.PROTO_MIN_EXTEND_LEN,
+            BigscapeConfig.NO_MIN_CLASSES,
+            BigscapeConfig.EXTEND_MATCH_SCORE,
+            BigscapeConfig.EXTEND_MISMATCH_SCORE,
+            BigscapeConfig.EXTEND_GAP_SCORE,
+            BigscapeConfig.EXTEND_MAX_MATCH_PERC,
+            BigscapeConfig.ANCHOR_DOMAINS,
+        )
+        BigscapeConfig.HASH = hashlib.sha256(
+            bytearray(str(content), "utf-8")
+        ).hexdigest()
+
     @staticmethod
     def write_config_log(log_path: Path, config: dict) -> None:
         """writes config log file
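A minimal sketch of the idea behind `generate_relevant_hash`, using hypothetical stand-in parameters rather than the real `BigscapeConfig` attributes: only values folded into the tuple affect the digest, so settings that cannot invalidate existing results (for example logging or profiling options) may change between runs without tripping the database check.

```python
# Minimal sketch, not BiG-SCAPE's implementation: the parameter names
# below are hypothetical stand-ins for the BigscapeConfig attributes.
import hashlib


def relevant_hash(cds_overlap_cutoff: float, region_min_lcs_len: int) -> str:
    # Only values that can invalidate previously computed results are
    # folded into the digest; everything else may change freely.
    relevant = (cds_overlap_cutoff, region_min_lcs_len)
    return hashlib.sha256(str(relevant).encode("utf-8")).hexdigest()


# A relevant change produces a new digest; identical settings agree.
assert relevant_hash(0.1, 10) != relevant_hash(0.2, 10)
assert relevant_hash(0.1, 10) == relevant_hash(0.1, 10)
```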
4 changes: 2 additions & 2 deletions big_scape/data/sqlite.py
@@ -481,8 +481,8 @@ def check_config_hash():
 
     if latest_config and BigscapeConfig.HASH != latest_config:
         raise RuntimeError(
-            "Config file values have changed from the previous run! "
-            "Existing data is not guarenteed to be reusable, please "
+            "Relevant config file values have changed (see config.log) from the "
+            "previous run! Existing data is not guarenteed to be reusable, please "
             "run with a fresh output directory/database."
         )
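For context, a sketch of how such a guard can be exercised end to end, assuming a hypothetical single-column `config_hash` table; BiG-SCAPE's actual schema and its `DB` wrapper may differ.

```python
# Sketch of a config-hash guard against a hypothetical `config_hash`
# table; BiG-SCAPE's real schema and DB wrapper may differ.
import sqlite3


def check_config_hash(conn: sqlite3.Connection, current_hash: str) -> None:
    # fetch the hash stored by the previous run (None on a fresh database)
    row = conn.execute("SELECT hash FROM config_hash").fetchone()
    latest_config = row[0] if row else None
    if latest_config and current_hash != latest_config:
        raise RuntimeError(
            "Relevant config file values have changed (see config.log) from "
            "the previous run! Existing data is not guaranteed to be "
            "reusable, please run with a fresh output directory/database."
        )


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE config_hash (hash TEXT)")
conn.execute("INSERT INTO config_hash VALUES ('abc123')")
check_config_hash(conn, "abc123")  # matching hash: passes silently
```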
14 changes: 10 additions & 4 deletions big_scape/genbank/candidate_cluster.py
@@ -7,6 +7,7 @@
 
 # from dependencies
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.data import DB
@@ -169,7 +170,7 @@ def __repr__(self) -> str:
         return f"{self.parent_gbk} Candidate cluster {self.number} {self.nt_start}-{self.nt_stop} "
 
     @staticmethod
-    def load_all(region_dict: dict[int, Region]):
+    def load_all(region_dict: dict[int, Region], temp_gbk_id_table: Table = None):
         """Load all CandidateCluster objects from the database
 
         This function populates the CandidateCluster lists in the Regions provided in
@@ -198,10 +199,15 @@ def load_all(region_dict: dict[int, Region]):
                 record_table.c.product,
             )
             .where(record_table.c.record_type == "cand_cluster")
-            .where(record_table.c.parent_id.in_(region_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            candidate_cluster_select_query = candidate_cluster_select_query.where(
+                record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        candidate_cluster_select_query = candidate_cluster_select_query.compile()
+
         cursor_result = DB.execute(candidate_cluster_select_query)
 
         candidate_cluster_dict = {}
@@ -230,4 +236,4 @@ def load_all(region_dict: dict[int, Region]):
             # add to dictionary
             candidate_cluster_dict[result.id] = new_candidate_cluster
 
-        ProtoCluster.load_all(candidate_cluster_dict)
+        ProtoCluster.load_all(candidate_cluster_dict, temp_gbk_id_table)
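The shape of this change — build the select, attach the temp-table filter only when one was supplied, and defer `.compile()` until the query is fully assembled — is plain SQLAlchemy and recurs in `big_scape/genbank/cds.py` below. A self-contained sketch, with a made-up `record` table standing in for the real schema:

```python
# Self-contained sketch of the conditional temp-table filter; the
# `record` table here is a stand-in, not BiG-SCAPE's real schema.
from typing import Optional

from sqlalchemy import Column, Integer, MetaData, String, Table, select

metadata = MetaData()
record = Table(
    "record",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("gbk_id", Integer),
    Column("record_type", String),
)


def build_query(temp_gbk_id_table: Optional[Table] = None):
    query = select(record.c.id).where(record.c.record_type == "cand_cluster")
    # Restrict by gbk id only when a temporary id table is supplied;
    # compiling is deferred until the query is fully assembled.
    if temp_gbk_id_table is not None:
        query = query.where(record.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)))
    return query.compile()
```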
12 changes: 9 additions & 3 deletions big_scape/genbank/cds.py
@@ -10,6 +10,7 @@
 from Bio.SeqFeature import SeqFeature
 from Bio.Seq import Seq
 from Bio import BiopythonWarning
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.errors import InvalidGBKError
@@ -320,7 +321,7 @@ def len_nt_overlap(cds_a: CDS, cds_b: CDS) -> int:
         return max(0, right - left)
 
     @staticmethod
-    def load_all(gbk_dict: dict[int, GBK]) -> None:
+    def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None:
         """Load all Region objects from the database
 
         This function populates the region objects in the GBKs provided in the input
@@ -349,10 +350,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
                 cds_table.c.aa_seq,
             )
             .order_by(cds_table.c.orf_num)
-            .where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            region_select_query = region_select_query.where(
+                cds_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        region_select_query = region_select_query.compile()
+
         cursor_result = DB.execute(region_select_query)
 
         for result in cursor_result.all():
125 changes: 121 additions & 4 deletions big_scape/genbank/gbk.py
@@ -6,6 +6,8 @@
 
 # from enum import Enum
 from pathlib import Path
+import random
+import string
 from typing import Dict, Optional
 import hashlib
 
@@ -14,6 +16,7 @@
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Column, ForeignKey, Integer, String, Table, select
 
 # from other modules
 from big_scape.errors import InvalidGBKError
@@ -34,6 +37,118 @@
 #     MIBIG = "mibig"
 #     REFERENCE = "reference"
 
+# TODO: generalize creating temp tables. this is copied from network.py
+
+
+def create_temp_hash_table(gbks: list[GBK]) -> Table:
+    """Create a temporary table with ids of given records
+
+    Args:
+        include_records (list[BGCRecord]): the records to include in the connected component
+
+    Returns:
+        Table: the temporary table
+    """
+
+    # generate a short random string
+    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))
+
+    temp_table = Table(
+        temp_table_name,
+        DB.metadata,
+        Column(
+            "hash",
+            String,
+            ForeignKey(DB.metadata.tables["gbk"].c.hash),
+            primary_key=True,
+            nullable=False,
+        ),
+        prefixes=["TEMPORARY"],
+    )
+
+    DB.metadata.create_all(DB.engine)
+
+    if DB.engine is None:
+        raise RuntimeError("DB engine is None")
+
+    cursor = DB.engine.raw_connection().driver_connection.cursor()
+
+    insert_query = f"""
+        INSERT INTO {temp_table_name} (hash) VALUES (?);
+    """
+
+    def batch_hash(gbks: list[GBK], n: int):
+        l = len(gbks)
+        for ndx in range(0, l, n):
+            yield [gbk.hash for gbk in gbks[ndx : min(ndx + n, l)]]
+
+    for hash_batch in batch_hash(gbks, 1000):
+        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore
+
+    cursor.close()
+
+    DB.commit()
+
+    if DB.metadata is None:
+        raise ValueError("DB metadata is None")
+
+    return temp_table
+
+
+def create_temp_gbk_id_table(gbks: list[GBK]) -> Table:
+    """Create a temporary table with ids of given gbks
+
+    Args:
+        gbks (list[GBK]): the gbks to include in the connected component
+
+    Returns:
+        Table: the temporary table
+    """
+
+    # generate a short random string
+    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))
+
+    temp_table = Table(
+        temp_table_name,
+        DB.metadata,
+        Column(
+            "gbk_id",
+            Integer,
+            ForeignKey(DB.metadata.tables["gbk"].c.id),
+            primary_key=True,
+            nullable=False,
+        ),
+        prefixes=["TEMPORARY"],
+    )
+
+    DB.metadata.create_all(DB.engine)
+
+    if DB.engine is None:
+        raise RuntimeError("DB engine is None")
+
+    cursor = DB.engine.raw_connection().driver_connection.cursor()
+
+    insert_query = f"""
+        INSERT INTO {temp_table_name} (gbk_id) VALUES (?);
+    """
+
+    def batch_hash(gbks: list[GBK], n: int):
+        l = len(gbks)
+        for ndx in range(0, l, n):
+            yield [gbk._db_id for gbk in gbks[ndx : min(ndx + n, l)]]
+
+    for hash_batch in batch_hash(gbks, 1000):
+        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore
+
+    cursor.close()
+
+    DB.commit()
+
+    if DB.metadata is None:
+        raise ValueError("DB metadata is None")
+
+    return temp_table
+
+
 class GBK:
     """
@@ -261,7 +376,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
+        temp_hash_table = create_temp_hash_table(input_gbks)
 
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")
@@ -278,7 +393,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
                 gbk_table.c.taxonomy,
                 gbk_table.c.description,
             )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.hash.in_(select(temp_hash_table.c.hash)))
             .compile()
         )
@@ -297,9 +412,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
         # load GBK regions. This will also populate all record levels below region
         # e.g. candidate cluster, protocore if they exist
 
-        Region.load_all(gbk_dict)
+        temp_gbk_id_table = create_temp_gbk_id_table(input_gbks)
 
-        CDS.load_all(gbk_dict)
+        Region.load_all(gbk_dict, temp_gbk_id_table)
+
+        CDS.load_all(gbk_dict, temp_gbk_id_table)
 
         return list(gbk_dict.values())
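A plausible motivation for routing ids through temporary tables instead of `IN (...)` lists (the commit itself does not state one): SQLite caps the number of bound parameters per statement (999 in older builds), so a run with thousands of GBKs could not bind every hash into a single `IN` clause, whereas `IN (SELECT ...)` against a temp table has no such limit. A minimal sqlite3-only sketch of the same pattern, independent of BiG-SCAPE's `DB` wrapper:

```python
# Minimal sqlite3-only sketch of the temp-table pattern used above:
# batch ids into a TEMPORARY table, then filter with IN (SELECT ...)
# instead of binding thousands of parameters into one IN (...) list.
import sqlite3


def batched(ids: list[int], n: int = 1000):
    # yield successive n-sized chunks, mirroring batch_hash() above
    for i in range(0, len(ids), n):
        yield ids[i : i + n]


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE gbk (id INTEGER PRIMARY KEY, name TEXT)")
conn.executemany(
    "INSERT INTO gbk VALUES (?, ?)", [(i, f"gbk_{i}") for i in range(5000)]
)

wanted = list(range(0, 5000, 2))
conn.execute("CREATE TEMPORARY TABLE temp_gbk_id (gbk_id INTEGER PRIMARY KEY)")
for chunk in batched(wanted):
    conn.executemany(
        "INSERT INTO temp_gbk_id (gbk_id) VALUES (?)", [(x,) for x in chunk]
    )
conn.commit()

rows = conn.execute(
    "SELECT id, name FROM gbk WHERE id IN (SELECT gbk_id FROM temp_gbk_id)"
).fetchall()
assert len(rows) == len(wanted)
```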
