Merge pull request #241 from medema-group/release/2.0.0-beta.3
Release/2.0.0 beta.3
adraismawur authored Dec 20, 2024
2 parents 94b567f + 0ec0c2b commit 44a4cc7
Showing 21 changed files with 248 additions and 163 deletions.
28 changes: 0 additions & 28 deletions .github/workflows/badges.yml
@@ -5,34 +5,6 @@ on:
     branches:
       - master
 
 jobs:
-  run_pytest:
-    runs-on: ubuntu-latest
-    if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
-    timeout-minutes: 15
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up mamba environment
-        uses: mamba-org/setup-micromamba@v1
-        with:
-          micromamba-version: '1.3.1-0'
-          environment-file: environment.yml
-          environment-name: BiG-SCAPE
-          init-shell: bash
-          generate-run-shell: true
-
-      - name: Install dependencies
-        shell: micromamba-shell {0}
-        run: |
-          python -m pip install pytest
-
-      - name: Test with Pytest
-        shell: micromamba-shell {0}
-        run: |
-          pytest
   generate_coverage:
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
38 changes: 38 additions & 0 deletions .github/workflows/run-tests.yml
@@ -0,0 +1,38 @@
+name: Run tests
+
+on:
+  push:
+    branches:
+      - master
+      - dev
+      - release/*
+      - feature/*
+      - hotfix/*
+
+jobs:
+  run_pytest:
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
+    timeout-minutes: 15
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up mamba environment
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          micromamba-version: '1.3.1-0'
+          environment-file: environment.yml
+          environment-name: BiG-SCAPE
+          init-shell: bash
+          generate-run-shell: true
+
+      - name: Install dependencies
+        shell: micromamba-shell {0}
+        run: |
+          python -m pip install pytest
+
+      - name: Test with Pytest
+        shell: micromamba-shell {0}
+        run: |
+          pytest
3 changes: 2 additions & 1 deletion README.md
@@ -1,7 +1,8 @@
 ![License](https://img.shields.io/github/license/medema-group/BiG-SCAPE)
 ![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29)
 ![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads)
-![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/test.yml/badge.svg)
+![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg)
+![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/deploy-docker.yml/badge.svg)
 ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg)
 ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg)
27 changes: 26 additions & 1 deletion big_scape/cli/config.py
@@ -165,7 +165,6 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> None
         """
         with open(config_file_path, "rb") as f:
             content = f.read()
-            BigscapeConfig.HASH = hashlib.sha256(content).hexdigest()
             config = yaml.load(content, Loader=yaml.FullLoader)
 
         # PROFILER
@@ -212,10 +211,36 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> None
                 legacy_classes[group] = set(classes)
             BigscapeConfig.LEGACY_ANTISMASH_CLASSES = legacy_classes
 
+        # store relevant hash
+        BigscapeConfig.generate_relevant_hash()
+
         # write config log
         if log_path is not None:
             BigscapeConfig.write_config_log(log_path, config)
 
+    @staticmethod
+    def generate_relevant_hash() -> None:
+        """Generates a config hash from values that might/will invalidate existing data"""
+        content = (
+            BigscapeConfig.MERGED_CAND_CLUSTER_TYPE,
+            BigscapeConfig.CDS_OVERLAP_CUTOFF,
+            BigscapeConfig.DOMAIN_OVERLAP_CUTOFF,
+            BigscapeConfig.REGION_MIN_LCS_LEN,
+            BigscapeConfig.PROTO_MIN_LCS_LEN,
+            BigscapeConfig.REGION_MIN_EXTEND_LEN,
+            BigscapeConfig.REGION_MIN_EXTEND_LEN_BIO,
+            BigscapeConfig.PROTO_MIN_EXTEND_LEN,
+            BigscapeConfig.NO_MIN_CLASSES,
+            BigscapeConfig.EXTEND_MATCH_SCORE,
+            BigscapeConfig.EXTEND_MISMATCH_SCORE,
+            BigscapeConfig.EXTEND_GAP_SCORE,
+            BigscapeConfig.EXTEND_MAX_MATCH_PERC,
+            BigscapeConfig.ANCHOR_DOMAINS,
+        )
+        BigscapeConfig.HASH = hashlib.sha256(
+            bytearray(str(content), "utf-8")
+        ).hexdigest()
+
     @staticmethod
     def write_config_log(log_path: Path, config: dict) -> None:
         """writes config log file
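A minimal sketch of the idea behind `generate_relevant_hash`, using hypothetical stand-in parameters rather than the real `BigscapeConfig` attributes: only values folded into the tuple affect the digest, so settings that cannot invalidate existing results (for example logging or profiling options) may change between runs without tripping the database check.

```python
# Minimal sketch, not BiG-SCAPE's implementation: the parameter names
# below are hypothetical stand-ins for the BigscapeConfig attributes.
import hashlib


def relevant_hash(cds_overlap_cutoff: float, region_min_lcs_len: int) -> str:
    # Only values that can invalidate previously computed results are
    # folded into the digest; everything else may change freely.
    relevant = (cds_overlap_cutoff, region_min_lcs_len)
    return hashlib.sha256(str(relevant).encode("utf-8")).hexdigest()


# A relevant change produces a new digest; identical settings agree.
assert relevant_hash(0.1, 10) != relevant_hash(0.2, 10)
assert relevant_hash(0.1, 10) == relevant_hash(0.1, 10)
```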
4 changes: 2 additions & 2 deletions big_scape/data/sqlite.py
@@ -481,8 +481,8 @@ def check_config_hash():
 
     if latest_config and BigscapeConfig.HASH != latest_config:
         raise RuntimeError(
-            "Config file values have changed from the previous run! "
-            "Existing data is not guarenteed to be reusable, please "
+            "Relevant config file values have changed (see config.log) from the "
+            "previous run! Existing data is not guarenteed to be reusable, please "
             "run with a fresh output directory/database."
         )
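For context, a sketch of how such a guard can be exercised end to end, assuming a hypothetical single-column `config_hash` table; BiG-SCAPE's actual schema and its `DB` wrapper may differ.

```python
# Sketch of a config-hash guard against a hypothetical `config_hash`
# table; BiG-SCAPE's real schema and DB wrapper may differ.
import sqlite3


def check_config_hash(conn: sqlite3.Connection, current_hash: str) -> None:
    # fetch the hash stored by the previous run (None on a fresh database)
    row = conn.execute("SELECT hash FROM config_hash").fetchone()
    latest_config = row[0] if row else None
    if latest_config and current_hash != latest_config:
        raise RuntimeError(
            "Relevant config file values have changed (see config.log) from "
            "the previous run! Existing data is not guaranteed to be "
            "reusable, please run with a fresh output directory/database."
        )


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE config_hash (hash TEXT)")
conn.execute("INSERT INTO config_hash VALUES ('abc123')")
check_config_hash(conn, "abc123")  # matching hash: passes silently
```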
14 changes: 10 additions & 4 deletions big_scape/genbank/candidate_cluster.py
@@ -7,6 +7,7 @@
 
 # from dependencies
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.data import DB
@@ -169,7 +170,7 @@ def __repr__(self) -> str:
         return f"{self.parent_gbk} Candidate cluster {self.number} {self.nt_start}-{self.nt_stop} "
 
     @staticmethod
-    def load_all(region_dict: dict[int, Region]):
+    def load_all(region_dict: dict[int, Region], temp_gbk_id_table: Table = None):
         """Load all CandidateCluster objects from the database
 
         This function populates the CandidateCluster lists in the Regions provided in
@@ -198,10 +199,15 @@ def load_all(region_dict: dict[int, Region]):
                 record_table.c.product,
             )
             .where(record_table.c.record_type == "cand_cluster")
-            .where(record_table.c.parent_id.in_(region_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            candidate_cluster_select_query = candidate_cluster_select_query.where(
+                record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        candidate_cluster_select_query = candidate_cluster_select_query.compile()
+
         cursor_result = DB.execute(candidate_cluster_select_query)
 
         candidate_cluster_dict = {}
@@ -230,4 +236,4 @@ def load_all(region_dict: dict[int, Region]):
             # add to dictionary
             candidate_cluster_dict[result.id] = new_candidate_cluster
 
-        ProtoCluster.load_all(candidate_cluster_dict)
+        ProtoCluster.load_all(candidate_cluster_dict, temp_gbk_id_table)
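The shape of this change — build the select, attach the temp-table filter only when one was supplied, and defer `.compile()` until the query is fully assembled — is plain SQLAlchemy and recurs in `big_scape/genbank/cds.py` below. A self-contained sketch, with a made-up `record` table standing in for the real schema:

```python
# Self-contained sketch of the conditional temp-table filter; the
# `record` table here is a stand-in, not BiG-SCAPE's real schema.
from typing import Optional

from sqlalchemy import Column, Integer, MetaData, String, Table, select

metadata = MetaData()
record = Table(
    "record",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("gbk_id", Integer),
    Column("record_type", String),
)


def build_query(temp_gbk_id_table: Optional[Table] = None):
    query = select(record.c.id).where(record.c.record_type == "cand_cluster")
    # Restrict by gbk id only when a temporary id table is supplied;
    # compiling is deferred until the query is fully assembled.
    if temp_gbk_id_table is not None:
        query = query.where(record.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id)))
    return query.compile()
```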
12 changes: 9 additions & 3 deletions big_scape/genbank/cds.py
@@ -10,6 +10,7 @@
 from Bio.SeqFeature import SeqFeature
 from Bio.Seq import Seq
 from Bio import BiopythonWarning
+from sqlalchemy import Table, select
 
 # from other modules
 from big_scape.errors import InvalidGBKError
@@ -320,7 +321,7 @@ def len_nt_overlap(cds_a: CDS, cds_b: CDS) -> int:
         return max(0, right - left)
 
     @staticmethod
-    def load_all(gbk_dict: dict[int, GBK]) -> None:
+    def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None:
         """Load all Region objects from the database
 
         This function populates the region objects in the GBKs provided in the input
@@ -349,10 +350,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
                 cds_table.c.aa_seq,
             )
             .order_by(cds_table.c.orf_num)
-            .where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
-            .compile()
         )
 
+        if temp_gbk_id_table is not None:
+            region_select_query = region_select_query.where(
+                cds_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
+            )
+
+        region_select_query = region_select_query.compile()
+
         cursor_result = DB.execute(region_select_query)
 
         for result in cursor_result.all():
125 changes: 121 additions & 4 deletions big_scape/genbank/gbk.py
@@ -6,6 +6,8 @@
 
 # from enum import Enum
 from pathlib import Path
+import random
+import string
 from typing import Dict, Optional
 import hashlib
 
@@ -14,6 +16,7 @@
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature
+from sqlalchemy import Column, ForeignKey, Integer, String, Table, select
 
 # from other modules
 from big_scape.errors import InvalidGBKError
@@ -34,6 +37,118 @@
 #     MIBIG = "mibig"
 #     REFERENCE = "reference"
 
+# TODO: generalize creating temp tables. this is copied from network.py
+
+
+def create_temp_hash_table(gbks: list[GBK]) -> Table:
+    """Create a temporary table with ids of given records
+
+    Args:
+        include_records (list[BGCRecord]): the records to include in the connected component
+
+    Returns:
+        Table: the temporary table
+    """
+
+    # generate a short random string
+    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))
+
+    temp_table = Table(
+        temp_table_name,
+        DB.metadata,
+        Column(
+            "hash",
+            String,
+            ForeignKey(DB.metadata.tables["gbk"].c.hash),
+            primary_key=True,
+            nullable=False,
+        ),
+        prefixes=["TEMPORARY"],
+    )
+
+    DB.metadata.create_all(DB.engine)
+
+    if DB.engine is None:
+        raise RuntimeError("DB engine is None")
+
+    cursor = DB.engine.raw_connection().driver_connection.cursor()
+
+    insert_query = f"""
+        INSERT INTO {temp_table_name} (hash) VALUES (?);
+    """
+
+    def batch_hash(gbks: list[GBK], n: int):
+        l = len(gbks)
+        for ndx in range(0, l, n):
+            yield [gbk.hash for gbk in gbks[ndx : min(ndx + n, l)]]
+
+    for hash_batch in batch_hash(gbks, 1000):
+        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore
+
+    cursor.close()
+
+    DB.commit()
+
+    if DB.metadata is None:
+        raise ValueError("DB metadata is None")
+
+    return temp_table
+
+
+def create_temp_gbk_id_table(gbks: list[GBK]) -> Table:
+    """Create a temporary table with ids of given gbks
+
+    Args:
+        gbks (list[GBK]): the gbks to include in the connected component
+
+    Returns:
+        Table: the temporary table
+    """
+
+    # generate a short random string
+    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))
+
+    temp_table = Table(
+        temp_table_name,
+        DB.metadata,
+        Column(
+            "gbk_id",
+            Integer,
+            ForeignKey(DB.metadata.tables["gbk"].c.id),
+            primary_key=True,
+            nullable=False,
+        ),
+        prefixes=["TEMPORARY"],
+    )
+
+    DB.metadata.create_all(DB.engine)
+
+    if DB.engine is None:
+        raise RuntimeError("DB engine is None")
+
+    cursor = DB.engine.raw_connection().driver_connection.cursor()
+
+    insert_query = f"""
+        INSERT INTO {temp_table_name} (gbk_id) VALUES (?);
+    """
+
+    def batch_hash(gbks: list[GBK], n: int):
+        l = len(gbks)
+        for ndx in range(0, l, n):
+            yield [gbk._db_id for gbk in gbks[ndx : min(ndx + n, l)]]
+
+    for hash_batch in batch_hash(gbks, 1000):
+        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore
+
+    cursor.close()
+
+    DB.commit()
+
+    if DB.metadata is None:
+        raise ValueError("DB metadata is None")
+
+    return temp_table
+
+
 class GBK:
     """
@@ -261,7 +376,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
+        temp_hash_table = create_temp_hash_table(input_gbks)
 
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")
@@ -278,7 +393,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
                 gbk_table.c.taxonomy,
                 gbk_table.c.description,
             )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.hash.in_(select(temp_hash_table.c.hash)))
             .compile()
         )
@@ -297,9 +412,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
         # load GBK regions. This will also populate all record levels below region
         # e.g. candidate cluster, protocore if they exist
 
-        Region.load_all(gbk_dict)
+        temp_gbk_id_table = create_temp_gbk_id_table(input_gbks)
 
-        CDS.load_all(gbk_dict)
+        Region.load_all(gbk_dict, temp_gbk_id_table)
+
+        CDS.load_all(gbk_dict, temp_gbk_id_table)
 
         return list(gbk_dict.values())
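A plausible motivation for routing ids through temporary tables instead of `IN (...)` lists (the commit itself does not state one): SQLite caps the number of bound parameters per statement (999 in older builds), so a run with thousands of GBKs could not bind every hash into a single `IN` clause, whereas `IN (SELECT ...)` against a temp table has no such limit. A minimal sqlite3-only sketch of the same pattern, independent of BiG-SCAPE's `DB` wrapper:

```python
# Minimal sqlite3-only sketch of the temp-table pattern used above:
# batch ids into a TEMPORARY table, then filter with IN (SELECT ...)
# instead of binding thousands of parameters into one IN (...) list.
import sqlite3


def batched(ids: list[int], n: int = 1000):
    # yield successive n-sized chunks, mirroring batch_hash() above
    for i in range(0, len(ids), n):
        yield ids[i : i + n]


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE gbk (id INTEGER PRIMARY KEY, name TEXT)")
conn.executemany(
    "INSERT INTO gbk VALUES (?, ?)", [(i, f"gbk_{i}") for i in range(5000)]
)

wanted = list(range(0, 5000, 2))
conn.execute("CREATE TEMPORARY TABLE temp_gbk_id (gbk_id INTEGER PRIMARY KEY)")
for chunk in batched(wanted):
    conn.executemany(
        "INSERT INTO temp_gbk_id (gbk_id) VALUES (?)", [(x,) for x in chunk]
    )
conn.commit()

rows = conn.execute(
    "SELECT id, name FROM gbk WHERE id IN (SELECT gbk_id FROM temp_gbk_id)"
).fetchall()
assert len(rows) == len(wanted)
```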
