From f6ddd58c975cd2ecdb2d14e91f8a0705990181c6 Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Tue, 3 Dec 2024 06:56:56 -0500 Subject: [PATCH 1/3] Fixed ambiguous sorting of the cgMLST alleles. Now sorting by bitscore and length, favouring the highest-scoring and longest allele matches. The previous sorting generated different results on different configurations. Sometimes alleles might have identical bitscores but different alignment length and percent identity --- sistr/src/cgmlst/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sistr/src/cgmlst/__init__.py b/sistr/src/cgmlst/__init__.py index c867ae9..383469a 100644 --- a/sistr/src/cgmlst/__init__.py +++ b/sistr/src/cgmlst/__init__.py @@ -208,7 +208,7 @@ def matches_to_marker_results(df): if len(v) > 1: logging.debug('Multiple potential cgMLST allele matches (n=%s) found for marker %s. Selecting match on longest contig.', len(v), k) df_marker = pd.DataFrame(v) - df_marker.sort_values('slen', ascending=False, inplace=True) + df_marker.sort_values(['bitscore','length'], ascending=False, inplace=True) for i,r in df_marker.iterrows(): allele = r['allele_name'] slen = r['slen'] From 6ec08bcdce8f65de92e3c0efd7680111586f256a Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Tue, 3 Dec 2024 18:06:59 -0500 Subject: [PATCH 2/3] Improved perfect-match logic: a perfect-match top hit is considered only when %id and %coverage are 100% and only a single antigen is present; otherwise hits are scored by highest bitscore. The cgMLST hits are sorted explicitly by max matched hashes and min distance to the closest genome. 
Partially improved Paratyphi B and variant Java discrimination --- CHANGELOG.md | 9 ++++++++- sistr/src/blast_wrapper/__init__.py | 3 ++- sistr/src/cgmlst/__init__.py | 2 +- sistr/src/serovar_prediction/__init__.py | 3 +-- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8a8201..170e31a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,8 @@ Serovar nomenclature update after USA Cantaloupe Outbreaks in November 2023. The | Poano - Stafford | Poano | ### Changes of serovar assignments in `sistr/data/genomes-to-serovar.txt` file - +The following updates were made to better reflect the O24 and O25 nomenclature, together with corrected +variant assignments for a few Paratyphi B and Paratyphi B var. Java genomes |genome accession | serovar previous | serovar current | |-----------------|------------------|-----------------| | SAL_DA9822AA | Soahanina |Sundsvall @@ -80,6 +81,12 @@ Serovar nomenclature update after USA Cantaloupe Outbreaks in November 2023. The |SAL_DA7014AA |Martonos | Finkenwerder |SRR1300569 |Martonos | Finkenwerder |SRR1973814 |Martonos | Finkenwerder +|17-2557 |Paratyphi B var. Java | Paratyphi B +|17-8544 |Paratyphi B var. Java | Paratyphi B +|17-9304 |Paratyphi B var. Java | Paratyphi B +|18-0116 |Paratyphi B var. Java | Paratyphi B +|17-7324 |Paratyphi B var. Java | Paratyphi B +|17-9312 |Paratyphi B var. 
Java | Paratyphi B ### Changes to `Salmonella-serotype_serogroup_antigen_table-WHO_2007.csv` antigen to serovar lookup database Removed the following entries diff --git a/sistr/src/blast_wrapper/__init__.py b/sistr/src/blast_wrapper/__init__.py index 60a8260..ecb9e05 100644 --- a/sistr/src/blast_wrapper/__init__.py +++ b/sistr/src/blast_wrapper/__init__.py @@ -295,8 +295,9 @@ def top_result(self): if self.is_missing: return None + blast_candidate_antigens = set([i.split("|")[-1] for i in self.df['qseqid'].to_list()]) # df_perfect_matches = self.df[(self.df['coverage'] == 1.0) & (self.df['pident'] == 100.0)] - if df_perfect_matches.shape[0]: + if df_perfect_matches.shape[0] > 0 and len(blast_candidate_antigens) == 1: self.is_perfect_match = True return BlastReader.df_first_row_to_dict(df_perfect_matches) diff --git a/sistr/src/cgmlst/__init__.py b/sistr/src/cgmlst/__init__.py index 383469a..4db378e 100644 --- a/sistr/src/cgmlst/__init__.py +++ b/sistr/src/cgmlst/__init__.py @@ -256,7 +256,7 @@ def find_closest_related_genome(marker_results, df_genome_profiles): df_relatives['matching'] = genome_profile_similarity_counts df_relatives['distance'] = 1.0 - (df_relatives['matching'] / float(n_markers)) df_relatives.index = df_genome_profiles.index - df_relatives.sort_values(by='distance', inplace=True) + df_relatives.sort_values(by=['distance','matching'], ascending=[True,False], inplace=True) return df_relatives diff --git a/sistr/src/serovar_prediction/__init__.py b/sistr/src/serovar_prediction/__init__.py index ba3f272..e0b75fd 100644 --- a/sistr/src/serovar_prediction/__init__.py +++ b/sistr/src/serovar_prediction/__init__.py @@ -133,8 +133,7 @@ def __init__(self, blast_runner): def search_for_wzx(self): self.wzx_prediction = self.get_antigen_gene_blast_results(self.wzx_prediction, WZX_FASTA_PATH) - #'blast_results', 'is_missing', 'is_perfect_match', 'is_trunc', 'serogroup', 'top_result' - #print( self.wzx_prediction.top_result); raise Exception() + if not 
self.wzx_prediction.is_missing and not self.wzx_prediction.top_result is None : top_result = self.wzx_prediction.top_result top_result_pident = top_result['pident'] From ef7f510d794c8941143a75b93d7d72894b30492b Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Tue, 3 Dec 2024 18:20:23 -0500 Subject: [PATCH 3/3] Updated README and constants with the new SISTR database URL --- README.rst | 2 +- sistr/src/serovar_prediction/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a9aa898..8ae3fc1 100644 --- a/README.rst +++ b/README.rst @@ -127,7 +127,7 @@ SISTR will automatically initialize database of *Salmonella* serovar determinati The SISTR database v1.3 got minor updates by collapsing some of the serovars with O24/O25 antigens detailed in `CHANGELOG.md `_ file - SISTR v1.1 database is available at https://zenodo.org/records/13618515 or via a direct url https://zenodo.org/records/13618515/files/SISTR_V_1.1_db.tar.gz?download=1 (used with SISTR < 1.1.3 ) -- SISTR v1.3 database is available at https://zenodo.org/records/13693495 or va a direct url https://zenodo.org/records/13693495/files/SISTR_V_1.1.3_db.tar.gz?download=1 (used with SISTR >= 1.1.3) +- SISTR v1.3 database is available at https://zenodo.org/records/14270992 or via a direct url https://zenodo.org/records/14270992/files/SISTR_V_1.1.3_db.tar.gz?download=1 (used with SISTR >= 1.1.3) Dependencies diff --git a/sistr/src/serovar_prediction/constants.py b/sistr/src/serovar_prediction/constants.py index cf37cf0..2eb68f9 100644 --- a/sistr/src/serovar_prediction/constants.py +++ b/sistr/src/serovar_prediction/constants.py @@ -8,7 +8,7 @@ CGMLST_SUBSPECIATION_DISTANCE_THRESHOLD = 0.9 MASH_SUBSPECIATION_DISTANCE_THRESHOLD = 0.01 -SISTR_DB_URL = 'https://zenodo.org/records/13693495/files/SISTR_V_1.1.3_db.tar.gz?download=1' +SISTR_DB_URL = 'https://zenodo.org/records/14270992/files/SISTR_V_1.1.3_db.tar.gz?download=1' SISTR_DATA_DIR = resource_filename('sistr','data') 
SEROVAR_TABLE_PATH = resource_filename('sistr', 'data/Salmonella-serotype_serogroup_antigen_table-WHO_2007.csv') WZX_FASTA_PATH = resource_filename('sistr', 'data/antigens/wzx.fasta')