From 6257fcb2e2f98ccd916e497a324a02d4e6aaaf8a Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 18 Dec 2024 11:23:38 +0100 Subject: [PATCH 01/15] config hash only relevant values --- big_scape/cli/config.py | 27 ++++++++++++++++++++++++++- big_scape/data/sqlite.py | 4 ++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/big_scape/cli/config.py b/big_scape/cli/config.py index d9ac2cb1..80d612f9 100644 --- a/big_scape/cli/config.py +++ b/big_scape/cli/config.py @@ -165,7 +165,6 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> Non """ with open(config_file_path, "rb") as f: content = f.read() - BigscapeConfig.HASH = hashlib.sha256(content).hexdigest() config = yaml.load(content, Loader=yaml.FullLoader) # PROFILER @@ -212,10 +211,36 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> Non legacy_classes[group] = set(classes) BigscapeConfig.LEGACY_ANTISMASH_CLASSES = legacy_classes + # store relevant hash + BigscapeConfig.generate_relevant_hash() + # write config log if log_path is not None: BigscapeConfig.write_config_log(log_path, config) + @staticmethod + def generate_relevant_hash() -> None: + """Generates a config hash from values that might/will invalidate existing data""" + content = ( + BigscapeConfig.MERGED_CAND_CLUSTER_TYPE, + BigscapeConfig.CDS_OVERLAP_CUTOFF, + BigscapeConfig.DOMAIN_OVERLAP_CUTOFF, + BigscapeConfig.REGION_MIN_LCS_LEN, + BigscapeConfig.PROTO_MIN_LCS_LEN, + BigscapeConfig.REGION_MIN_EXTEND_LEN, + BigscapeConfig.REGION_MIN_EXTEND_LEN_BIO, + BigscapeConfig.PROTO_MIN_EXTEND_LEN, + BigscapeConfig.NO_MIN_CLASSES, + BigscapeConfig.EXTEND_MATCH_SCORE, + BigscapeConfig.EXTEND_MISMATCH_SCORE, + BigscapeConfig.EXTEND_GAP_SCORE, + BigscapeConfig.EXTEND_MAX_MATCH_PERC, + BigscapeConfig.ANCHOR_DOMAINS, + ) + BigscapeConfig.HASH = hashlib.sha256( + bytearray(str(content), "utf-8") + ).hexdigest() + @staticmethod def write_config_log(log_path: Path, config: dict) -> None: """writes config log file diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py index 2935c0a2..5c549278 100644 --- a/big_scape/data/sqlite.py +++ b/big_scape/data/sqlite.py @@ -481,8 +481,8 @@ def check_config_hash(): if latest_config and BigscapeConfig.HASH != latest_config: raise RuntimeError( - "Config file values have changed from the previous run! " - "Existing data is not guarenteed to be reusable, please " + "Relevant config file values have changed (see config.log) from the " + "previous run! Existing data is not guarenteed to be reusable, please " "run with a fresh output directory/database." ) From 5076608c7500be0dfec36c436876acaf0c5cce71 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 18 Dec 2024 13:40:58 +0100 Subject: [PATCH 02/15] swap from toml to tomllib --- big_scape/utility/version.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/big_scape/utility/version.py b/big_scape/utility/version.py index df85de3a..daefdd02 100644 --- a/big_scape/utility/version.py +++ b/big_scape/utility/version.py @@ -1,8 +1,7 @@ """Module that contains helper functions specifically related to the bigscape version """ -import toml - +import tomllib from importlib import metadata from pathlib import Path @@ -21,7 +20,8 @@ def get_bigscape_version() -> str: pyproject_toml = Path(__file__).parent.parent.parent / "pyproject.toml" if pyproject_toml.exists(): - return toml.load(pyproject_toml)["project"]["version"] + with open(pyproject_toml, "rb") as fp: + return tomllib.load(fp)["project"]["version"] # if not, we're probably running as a package. get the version of the currently # installed big-scape package From 56d092d035824c383876c25bd0369de25a607bfd Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 18 Dec 2024 13:41:54 +0100 Subject: [PATCH 03/15] change to lowercase folder name --- ...SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename {Figures => figures}/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png (100%) diff --git a/Figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png b/figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png similarity index 100% rename from Figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png rename to figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png From 848f7479dc2cc263be9a07b7e7642888b9c54766 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 18 Dec 2024 13:52:43 +0100 Subject: [PATCH 04/15] add missing test __init__.py --- test/comparison/__init__.py | 1 + test/hmm/__init__.py | 1 + test/network/__init__.py | 1 + 3 files changed, 3 insertions(+) create mode 100644 test/comparison/__init__.py create mode 100644 test/hmm/__init__.py create mode 100644 test/network/__init__.py diff --git a/test/comparison/__init__.py b/test/comparison/__init__.py new file mode 100644 index 00000000..03dc8e43 --- /dev/null +++ b/test/comparison/__init__.py @@ -0,0 +1 @@ +"""Contains tests for distance calculation""" diff --git a/test/hmm/__init__.py b/test/hmm/__init__.py new file mode 100644 index 00000000..c6dfb8cc --- /dev/null +++ b/test/hmm/__init__.py @@ -0,0 +1 @@ +"""Contains tests involving (py)HMMer analysis and processing""" diff --git a/test/network/__init__.py b/test/network/__init__.py new file mode 100644 index 00000000..8f435c38 --- /dev/null +++ b/test/network/__init__.py @@ -0,0 +1 @@ +"""Contains tests involving CC and family generation""" From cd3b2f21b1f7c81afaca5f7834ec9c6b80628a2e Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 18 Dec 2024 14:01:46 +0100 Subject: [PATCH 05/15] remove unused networkx dependency --- big_scape/network/families.py | 61 ----------------------------------- big_scape/network/utility.py | 17 ---------- environment.yml | 1 - pyproject.toml | 1 - 4 files changed, 80 deletions(-) diff --git a/big_scape/network/families.py b/big_scape/network/families.py index 10a54ba4..3f3bd15e 100644 --- a/big_scape/network/families.py +++ b/big_scape/network/families.py @@ -4,9 +4,6 @@ import sys from typing import Callable, Optional import warnings -import numpy as np -import networkx -import math import logging # from dependencies @@ -105,24 +102,6 @@ def generate_families( return regions_families -def get_cc_edge_weight_std(connected_component) -> float: - """calculates the standard deviation of the edge weights of a connected component - - Args: - connected_component (list[tuple[int, int, float, float, float, float, str]]): - connected component in the form of a list of edges - - Returns: - float: standard deviation of the edge weights of the connected component - """ - - edge_weights = [edge[2] for edge in connected_component] - edge_std = np.std(edge_weights) - edge_std = round(edge_std, 2) - - return edge_std - - def get_cc_density( connected_component: list[tuple[int, int, float, float, float, float, int]] ) -> float: @@ -148,46 +127,6 @@ def get_cc_density( return cc_density -def test_centrality(connected_component, node_fraction) -> tuple[bool, list[int]]: - """tests if a network will break when removing the top nodes - with highest betweenness centrality - - Args: - connected_component (list[tuple[int, int, float, float, float, float, str]]): - connected component in the form of a list of edges - node_fraction (float): fraction of nodes with highest betweenness centrality to remove - - Returns: - tuple[bool, list[int]]: whether the network breaks and the list of nodes sorted by betweenness centrality - """ - - edgelist = [(edge[0], edge[1], edge[2]) for edge in connected_component] - - graph = networkx.Graph() - graph.add_weighted_edges_from(edgelist) - - betweeness_centrality_dict = networkx.betweenness_centrality(graph) - sorted_between_bentrality_nodes = sorted( - betweeness_centrality_dict, key=betweeness_centrality_dict.get, reverse=True - ) - - # round up to nearest integer - top_nodes = math.ceil(len(sorted_between_bentrality_nodes) * node_fraction) - nodes_to_remove = sorted_between_bentrality_nodes[:top_nodes] - - for node in nodes_to_remove: - graph.remove_node(node) - - nr_ccs = networkx.number_connected_components(graph) - - del graph - - if nr_ccs > 1: - return True, sorted_between_bentrality_nodes - - return False, sorted_between_bentrality_nodes - - def aff_sim_matrix(matrix, preference: Optional[float] = None): """Execute affinity propagation on a __similarity__ matrix diff --git a/big_scape/network/utility.py b/big_scape/network/utility.py index ab3a9dfd..284c544d 100644 --- a/big_scape/network/utility.py +++ b/big_scape/network/utility.py @@ -2,23 +2,6 @@ # from dependencies import numpy as np -import networkx as nx - - -def sim_matrix_from_graph(graph: nx.Graph, edge_property: str) -> np.ndarray: - """Return a similarity matrix from a graph in the form of a numpy array - - Args: - graph (Graph): graph - edge_property (str): _description_ - - Returns: - ndarray: _description_ - """ - matrix = nx.to_numpy_array(graph, weight=edge_property, nonedge=1.0) - # have to convert from distances to similarity - matrix = 1 - matrix - return matrix def edge_list_to_adj_list( diff --git a/environment.yml b/environment.yml index 1a86722f..e668390c 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,6 @@ dependencies: - biopython=1.81 - sortedcontainers=2.4.0 - fasttree=2.1.11 - - networkx=3.1 - numpy=1.26.0 - pandas=2.1.1 - pyhmmer=0.10.14 diff --git a/pyproject.toml b/pyproject.toml index a68e0a86..2a3e1506 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,6 @@ dev = [ # type stubs (https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports) "types-psutil", - "networkx-stubs", "data-science-types", "types-tqdm", "types-setuptools" From a6010774c1cf510892c95d6c46f2f58f5a4a1775 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 18 Dec 2024 14:08:21 +0100 Subject: [PATCH 06/15] fix tests --- test/network/test_family.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/test/network/test_family.py b/test/network/test_family.py index 95e904d2..07c0bcbe 100644 --- a/test/network/test_family.py +++ b/test/network/test_family.py @@ -121,18 +121,6 @@ def test_aff_sim_matrix(self): self.assertListEqual(expected_labels, actual_labels) - def test_get_cc_edge_weight_std(self): - """Tests whether the standard deviation of the edge weights of a connected - component is correctly calculated - """ - adj_list = TestAffinityPropagation.gen_edge_list() - - expected_std = 0.12 - - actual_std = bs_families.get_cc_edge_weight_std(adj_list) - - self.assertEqual(expected_std, actual_std) - def test_get_cc_density(self): """Tests whether the density of a connected component is correctly calculated @@ -146,17 +134,3 @@ def test_get_cc_density(self): actual_density = bs_families.get_cc_density(adj_list) self.assertEqual(expected_density, actual_density) - - def test_test_centrality(self): - """Tests whether the test_centrality function correctly identifies a network - that will break when removing the top nodes with highest betweenness centrality - """ - adj_list = TestAffinityPropagation.gen_edge_list_alt() - - expected_break = True - - actual_break, actual_sorted_centrality_nodes = bs_families.test_centrality( - adj_list, 0.3 - ) - - self.assertEqual(expected_break, actual_break) From a8ef76f5c2a4752c33f7703048e55e38cfa56bb0 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Thu, 19 Dec 2024 09:14:17 +0100 Subject: [PATCH 07/15] split tests from badges yml --- .github/workflows/badges.yml | 28 ------------------------- .github/workflows/run-tests.yml | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/run-tests.yml diff --git a/.github/workflows/badges.yml b/.github/workflows/badges.yml index 6bcfed64..ee2e3be0 100644 --- a/.github/workflows/badges.yml +++ b/.github/workflows/badges.yml @@ -5,34 +5,6 @@ on: branches: - master -jobs: - run_pytest: - runs-on: ubuntu-latest - if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }} - timeout-minutes: 15 - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Set up mamba environment - uses: mamba-org/setup-micromamba@v1 - with: - micromamba-version: '1.3.1-0' - environment-file: environment.yml - environment-name: BiG-SCAPE - init-shell: bash - generate-run-shell: true - - - name: Install dependencies - shell: micromamba-shell {0} - run: | - python -m pip install pytest - - - name: Test with Pytest - shell: micromamba-shell {0} - run: | - pytest - generate_coverage: runs-on: ubuntu-latest if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }} diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml new file mode 100644 index 00000000..589e11b9 --- /dev/null +++ b/.github/workflows/run-tests.yml @@ -0,0 +1,36 @@ +name: Run tests + +on: + push: + branches: + - master + - dev + - release/* + +jobs: + run_pytest: + runs-on: ubuntu-latest + if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }} + timeout-minutes: 15 + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up mamba environment + uses: mamba-org/setup-micromamba@v1 + with: + micromamba-version: '1.3.1-0' + environment-file: environment.yml + environment-name: BiG-SCAPE + init-shell: bash + generate-run-shell: true + + - name: Install dependencies + shell: micromamba-shell {0} + run: | + python -m pip install pytest + + - name: Test with Pytest + shell: micromamba-shell {0} + run: | + pytest From b05cf989a23758f8d954ba51b5e810cd20146307 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Thu, 19 Dec 2024 09:17:57 +0100 Subject: [PATCH 08/15] add more branch types --- .github/workflows/run-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 589e11b9..589e63fc 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -6,6 +6,8 @@ on: - master - dev - release/* + - feature/* + - hotfix/* jobs: run_pytest: From 95714b085ce788a033401a746fcd7f49ad136819 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Fri, 20 Dec 2024 11:03:37 +0100 Subject: [PATCH 09/15] fix pytest badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 621749eb..7fac44d8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ![License](https://img.shields.io/github/license/medema-group/BiG-SCAPE) ![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29) ![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads) -![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/test.yml/badge.svg) +![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg) ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg) ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg) From 3d5cad0dba4f16d556d1f7ffc9bb5eddfd30f029 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Fri, 20 Dec 2024 11:03:45 +0100 Subject: [PATCH 10/15] add docker image upload badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7fac44d8..633172e3 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ ![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29) ![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads) ![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg) +![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/deploy-docker.yml/badge.svg) ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg) ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg) From 032094fbf4bd72c7eaa27ff2317837b5e62cf2c0 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Tue, 15 Oct 2024 16:36:27 +0200 Subject: [PATCH 11/15] implement temp table for hashes --- big_scape/genbank/gbk.py | 80 ++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py index 3b1e01ee..8c6c764a 100644 --- a/big_scape/genbank/gbk.py +++ b/big_scape/genbank/gbk.py @@ -6,6 +6,8 @@ # from enum import Enum from pathlib import Path +import random +import string from typing import Dict, Optional import hashlib @@ -14,6 +16,7 @@ from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.SeqFeature import SeqFeature +from sqlalchemy import Column, ForeignKey, Integer, String, Table, select # from other modules from big_scape.errors import InvalidGBKError @@ -34,6 +37,57 @@ # MIBIG = "mibig" # REFERENCE = "reference" +# TODO: generalize creating temp tables. this is copied from network.py + + +def create_temp_hash_table(hashes: list[str]) -> Table: + """Create a temporary table with ids of given records + + Args: + include_records (list[BGCRecord]): the records to include in the connected component + + Returns: + Table: the temporary table + """ + + # generate a short random string + temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10)) + + temp_table = Table( + temp_table_name, + DB.metadata, + Column( + "hash", + String, + ForeignKey(DB.metadata.tables["gbk"].c.hash), + primary_key=True, + nullable=False, + ), + prefixes=["TEMPORARY"], + ) + + DB.metadata.create_all(DB.engine) + + if DB.engine is None: + raise RuntimeError("DB engine is None") + + cursor = DB.engine.raw_connection().driver_connection.cursor() + + insert_query = f""" + INSERT INTO {temp_table_name} (hash) VALUES (?); + """ + + cursor.executemany(insert_query, [(x,) for x in hashes]) # type: ignore + + cursor.close() + + DB.commit() + + if DB.metadata is None: + raise ValueError("DB metadata is None") + + return temp_table + class GBK: """ @@ -261,7 +315,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: list[GBK]: loaded GBK objects """ - input_gbk_hashes = [gbk.hash for gbk in input_gbks] + def iter_hashes(): + for gbk in input_gbks: + yield gbk.hash + + temp_hash_table = create_temp_hash_table(iter_hashes()) if not DB.metadata: raise RuntimeError("DB.metadata is None") @@ -278,7 +336,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: gbk_table.c.taxonomy, gbk_table.c.description, ) - .where(gbk_table.c.hash.in_(input_gbk_hashes)) + .where(gbk_table.c.hash.in_(select(temp_hash_table.c.hash))) .compile() ) @@ -635,15 +693,15 @@ def collapse_hybrids_in_cand_clusters( for number in cand_cluster.proto_clusters.keys() ] merged_protocluster = MergedProtoCluster.merge(protoclusters) - merged_tmp_proto_clusters[ - merged_protocluster.number - ] = merged_protocluster + merged_tmp_proto_clusters[merged_protocluster.number] = ( + merged_protocluster + ) # update the protocluster old:new ids for the merged protoclusters of this cand_cluster for proto_cluster_num in cand_cluster.proto_clusters.keys(): - merged_protocluster_ids[ - proto_cluster_num - ] = merged_protocluster.number + merged_protocluster_ids[proto_cluster_num] = ( + merged_protocluster.number + ) # now we build a new version of the tmp_proto_clusters dict that contains the merged protoclusters # as well as protoclusters which did not need merging, with updated unique IDs/numbers @@ -657,9 +715,9 @@ def collapse_hybrids_in_cand_clusters( # this protocluster has been merged, so we need to add it to # the dict with its new protocluster number new_proto_cluster_num = merged_protocluster_ids[proto_cluster_num] - updated_tmp_proto_clusters[ - new_proto_cluster_num - ] = merged_tmp_proto_clusters[new_proto_cluster_num] + updated_tmp_proto_clusters[new_proto_cluster_num] = ( + merged_tmp_proto_clusters[new_proto_cluster_num] + ) updated_proto_cluster_dict[new_proto_cluster_num] = None else: # protoclusters which have not been merged are added to the dict as is From 441b668ff8eecf762f6edf96d2762b4c475b69d7 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Wed, 16 Oct 2024 10:40:48 +0200 Subject: [PATCH 12/15] batch hash insertions --- big_scape/genbank/gbk.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py index 8c6c764a..2febd7e3 100644 --- a/big_scape/genbank/gbk.py +++ b/big_scape/genbank/gbk.py @@ -40,7 +40,7 @@ # TODO: generalize creating temp tables. this is copied from network.py -def create_temp_hash_table(hashes: list[str]) -> Table: +def create_temp_hash_table(gbks: list[GBK]) -> Table: """Create a temporary table with ids of given records Args: @@ -77,7 +77,13 @@ def create_temp_hash_table(hashes: list[str]) -> Table: INSERT INTO {temp_table_name} (hash) VALUES (?); """ - cursor.executemany(insert_query, [(x,) for x in hashes]) # type: ignore + def batch_hash(gbks: list[GBK], n: int): + l = len(gbks) + for ndx in range(0, l, n): + yield [gbk.hash for gbk in gbks[ndx : min(ndx + n, l)]] + + for hash_batch in batch_hash(gbks, 1000): + cursor.executemany(insert_query, [(x,) for x in hash_batch]) # type: ignore cursor.close() @@ -315,11 +321,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: list[GBK]: loaded GBK objects """ - def iter_hashes(): - for gbk in input_gbks: - yield gbk.hash - - temp_hash_table = create_temp_hash_table(iter_hashes()) + temp_hash_table = create_temp_hash_table(input_gbks) if not DB.metadata: raise RuntimeError("DB.metadata is None") From 15f1c9402959ff6df316e74c0c16d5119e534ea1 Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Fri, 20 Dec 2024 15:32:31 +0100 Subject: [PATCH 13/15] use a temp table for gbk ids in record loading --- big_scape/genbank/candidate_cluster.py | 14 +++-- big_scape/genbank/cds.py | 12 +++- big_scape/genbank/gbk.py | 79 ++++++++++++++++++++++---- big_scape/genbank/proto_cluster.py | 15 ++++- big_scape/genbank/proto_core.py | 14 ++++- big_scape/genbank/region.py | 14 +++-- 6 files changed, 120 insertions(+), 28 deletions(-) diff --git a/big_scape/genbank/candidate_cluster.py b/big_scape/genbank/candidate_cluster.py index 1e0c64cb..e1626023 100644 --- a/big_scape/genbank/candidate_cluster.py +++ b/big_scape/genbank/candidate_cluster.py @@ -7,6 +7,7 @@ # from dependencies from Bio.SeqFeature import SeqFeature +from sqlalchemy import Table, select # from other modules from big_scape.data import DB @@ -169,7 +170,7 @@ def __repr__(self) -> str: return f"{self.parent_gbk} Candidate cluster {self.number} {self.nt_start}-{self.nt_stop} " @staticmethod - def load_all(region_dict: dict[int, Region]): + def load_all(region_dict: dict[int, Region], temp_gbk_id_table: Table = None): """Load all CandidateCluster objects from the database This function populates the CandidateCluster lists in the Regions provided in @@ -198,10 +199,15 @@ def load_all(region_dict: dict[int, Region]): record_table.c.product, ) .where(record_table.c.record_type == "cand_cluster") - .where(record_table.c.parent_id.in_(region_dict.keys())) - .compile() ) + if temp_gbk_id_table is not None: + candidate_cluster_select_query = candidate_cluster_select_query.where( + record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)) + ) + + candidate_cluster_select_query = candidate_cluster_select_query.compile() + cursor_result = DB.execute(candidate_cluster_select_query) candidate_cluster_dict = {} @@ -230,4 +236,4 @@ def load_all(region_dict: dict[int, Region]): # add to dictionary candidate_cluster_dict[result.id] = new_candidate_cluster - ProtoCluster.load_all(candidate_cluster_dict) + ProtoCluster.load_all(candidate_cluster_dict, temp_gbk_id_table) diff --git a/big_scape/genbank/cds.py b/big_scape/genbank/cds.py index 752845c6..e91aeae1 100644 --- a/big_scape/genbank/cds.py +++ b/big_scape/genbank/cds.py @@ -10,6 +10,7 @@ from Bio.SeqFeature import SeqFeature from Bio.Seq import Seq from Bio import BiopythonWarning +from sqlalchemy import Table, select # from other modules from big_scape.errors import InvalidGBKError @@ -320,7 +321,7 @@ def len_nt_overlap(cds_a: CDS, cds_b: CDS) -> int: return max(0, right - left) @staticmethod - def load_all(gbk_dict: dict[int, GBK]) -> None: + def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None: """Load all Region objects from the database This function populates the region objects in the GBKs provided in the input @@ -349,10 +350,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None: cds_table.c.aa_seq, ) .order_by(cds_table.c.orf_num) - .where(cds_table.c.gbk_id.in_(gbk_dict.keys())) - .compile() ) + if temp_gbk_id_table is not None: + region_select_query = region_select_query.where( + cds_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)) + ) + + region_select_query = region_select_query.compile() + cursor_result = DB.execute(region_select_query) for result in cursor_result.all(): diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py index 2febd7e3..a6844fc3 100644 --- a/big_scape/genbank/gbk.py +++ b/big_scape/genbank/gbk.py @@ -95,6 +95,61 @@ def batch_hash(gbks: list[GBK], n: int): return temp_table +def create_temp_gbk_id_table(gbks: list[GBK]) -> Table: + """Create a temporary table with ids of given gbks + + Args: + gbks (list[GBK]): the gbks to include in the connected component + + Returns: + Table: the temporary table + """ + + # generate a short random string + temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10)) + + temp_table = Table( + temp_table_name, + DB.metadata, + Column( + "gbk_id", + Integer, + ForeignKey(DB.metadata.tables["gbk"].c.id), + primary_key=True, + nullable=False, + ), + prefixes=["TEMPORARY"], + ) + + DB.metadata.create_all(DB.engine) + + if DB.engine is None: + raise RuntimeError("DB engine is None") + + cursor = DB.engine.raw_connection().driver_connection.cursor() + + insert_query = f""" + INSERT INTO {temp_table_name} (gbk_id) VALUES (?); + """ + + def batch_hash(gbks: list[GBK], n: int): + l = len(gbks) + for ndx in range(0, l, n): + yield [gbk._db_id for gbk in gbks[ndx : min(ndx + n, l)]] + + for hash_batch in batch_hash(gbks, 1000): + cursor.executemany(insert_query, [(x,) for x in hash_batch]) # type: ignore + + cursor.close() + + DB.commit() + + if DB.metadata is None: + raise ValueError("DB metadata is None") + + return temp_table + + class GBK: """ Class to describe a given GBK file @@ -357,9 +412,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: # load GBK regions. This will also populate all record levels below region # e.g. candidate cluster, protocore if they exist - Region.load_all(gbk_dict) + temp_gbk_id_table = create_temp_gbk_id_table(input_gbks) - CDS.load_all(gbk_dict) + Region.load_all(gbk_dict, temp_gbk_id_table) + + CDS.load_all(gbk_dict, temp_gbk_id_table) return list(gbk_dict.values()) @@ -695,15 +752,15 @@ def collapse_hybrids_in_cand_clusters( for number in cand_cluster.proto_clusters.keys() ] merged_protocluster = MergedProtoCluster.merge(protoclusters) - merged_tmp_proto_clusters[merged_protocluster.number] = ( - merged_protocluster - ) + merged_tmp_proto_clusters[ + merged_protocluster.number + ] = merged_protocluster # update the protocluster old:new ids for the merged protoclusters of this cand_cluster for proto_cluster_num in cand_cluster.proto_clusters.keys(): - merged_protocluster_ids[proto_cluster_num] = ( - merged_protocluster.number - ) + merged_protocluster_ids[ + proto_cluster_num + ] = merged_protocluster.number # now we build a new version of the tmp_proto_clusters dict that contains the merged protoclusters # as well as protoclusters which did not need merging, with updated unique IDs/numbers @@ -717,9 +774,9 @@ def collapse_hybrids_in_cand_clusters( # this protocluster has been merged, so we need to add it to # the dict with its new protocluster number new_proto_cluster_num = merged_protocluster_ids[proto_cluster_num] - updated_tmp_proto_clusters[new_proto_cluster_num] = ( - merged_tmp_proto_clusters[new_proto_cluster_num] - ) + updated_tmp_proto_clusters[ + new_proto_cluster_num + ] = merged_tmp_proto_clusters[new_proto_cluster_num] updated_proto_cluster_dict[new_proto_cluster_num] = None else: # protoclusters which have not been merged are added to the dict as is diff --git a/big_scape/genbank/proto_cluster.py b/big_scape/genbank/proto_cluster.py index 8a1dbf6e..53c6cdfd 100644 --- a/big_scape/genbank/proto_cluster.py +++ b/big_scape/genbank/proto_cluster.py @@ -7,6 +7,7 @@ # from dependencies from Bio.SeqFeature import SeqFeature +from sqlalchemy import Table, select # from other modules from big_scape.data import DB @@ -179,7 +180,10 @@ def __repr__(self) -> str: return f"{self.parent_gbk} ProtoCluster {self.number} {self.nt_start}-{self.nt_stop} " @staticmethod - def load_all(candidate_cluster_dict: dict[int, CandidateCluster]): + def load_all( + candidate_cluster_dict: dict[int, CandidateCluster], + temp_gbk_id_table: Table = None, + ): """Load all ProtoCluster objects from the database This function populates the CandidateCluster objects in the GBKs provided in the @@ -210,10 +214,15 @@ def load_all(candidate_cluster_dict: dict[int, CandidateCluster]): record_table.c.merged, ) .where(record_table.c.record_type == "protocluster") - .where(record_table.c.parent_id.in_(candidate_cluster_dict.keys())) - .compile() ) + if temp_gbk_id_table is not None: + protocluster_select_query = protocluster_select_query.where( + record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)) + ) + + protocluster_select_query = protocluster_select_query.compile() + cursor_result = DB.execute(protocluster_select_query) protocluster_dict = {} diff --git a/big_scape/genbank/proto_core.py b/big_scape/genbank/proto_core.py index b5cc6481..090d9b2c 100644 --- a/big_scape/genbank/proto_core.py +++ b/big_scape/genbank/proto_core.py @@ -7,6 +7,7 @@ # from dependencies from Bio.SeqFeature import SeqFeature +from sqlalchemy import Table, select # from other modules from big_scape.data import DB @@ -110,7 +111,9 @@ def __repr__(self) -> str: ) @staticmethod - def load_all(protocluster_dict: dict[int, ProtoCluster]): + def load_all( + protocluster_dict: dict[int, ProtoCluster], temp_gbk_id_table: Table = None + ): """Load all ProtoCore objects from the database This function populates the region objects in the GBKs provided in the input @@ -141,10 +144,15 @@ def load_all(protocluster_dict: dict[int, ProtoCluster]): record_table.c.merged, ) .where(record_table.c.record_type == "proto_core") - .where(record_table.c.parent_id.in_(protocluster_dict.keys())) - .compile() ) + if temp_gbk_id_table is not None: + region_select_query = region_select_query.where( + record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)) + ) + + region_select_query = region_select_query.compile() + cursor_result = DB.execute(region_select_query) for result in cursor_result.all(): diff --git a/big_scape/genbank/region.py b/big_scape/genbank/region.py index 7b7c6d45..3cd85ba7 100644 --- a/big_scape/genbank/region.py +++ b/big_scape/genbank/region.py @@ -8,6 +8,7 @@ # from dependencies from Bio.SeqFeature import SeqFeature from Bio.SeqRecord import SeqRecord +from sqlalchemy import Table, select # from other modules from big_scape.data import DB @@ -262,7 +263,7 @@ def __repr__(self): return f"{self.parent_gbk} Region {self.number} {self.nt_start}-{self.nt_stop} " @staticmethod - def load_all(gbk_dict: dict[int, GBK]) -> None: + def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None: """Load all Region objects from the database This function populates the region objects in the GBKs provided in the input @@ -292,10 +293,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None: record_table.c.product, ) .where(record_table.c.record_type == "region") - .where(record_table.c.gbk_id.in_(gbk_dict.keys())) - .compile() ) + if temp_gbk_id_table is not None: + region_select_query = region_select_query.where( + record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)) + ) + + region_select_query = region_select_query.compile() + cursor_result = DB.execute(region_select_query) region_dict = {} @@ -320,4 +326,4 @@ def load_all(gbk_dict: dict[int, GBK]) -> None: # add to dictionary region_dict[result.id] = new_region - CandidateCluster.load_all(region_dict) + CandidateCluster.load_all(region_dict, temp_gbk_id_table) From 645c9867445a2ab8d9e9249437d3e25539562f68 Mon Sep 17 00:00:00 2001 From: adraismawur Date: Mon, 16 Dec 2024 14:58:22 +0000 Subject: [PATCH 14/15] Update version number to new_version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2a3e1506..b7898b9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name ="big-scape" -version = "release/2.0.0" +version = "2.0.0-beta.2" description = "Biosynthetic Gene Similarity Clustering and Prospecting Engine" requires-python = ">=3.11" license = { file = "LICENSE" } From 1912a4cfefef82a0d86e6a7c90ed4d6f1e204444 Mon Sep 17 00:00:00 2001 From: adraismawur Date: Fri, 20 Dec 2024 17:25:31 +0000 Subject: [PATCH 15/15] Update version number to new_version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b7898b9c..baa2ff18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name ="big-scape" -version = "2.0.0-beta.2" +version = "2.0.0-beta.3" description = "Biosynthetic Gene Similarity Clustering and Prospecting Engine" requires-python = ">=3.11" license = { file = "LICENSE" }