diff --git a/CHANGELOG.md b/CHANGELOG.md index e8a8201..170e31a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,8 @@ Serovar nomenclature update after USA Cantaloupe Outbreaks in November 2023. The | Poano - Stafford | Poano | ### Changes of serovar assignments in `sistr/data/genomes-to-serovar.txt` file - +The following updates were made to better reflect the O24 and O25 nomenclature, together with a few +Paratyphi B and Paratyphi B var. Java genome designations updated to the correct variant assignments. |genome accession | serovar previous | serovar current | |-----------------|------------------|-----------------| | SAL_DA9822AA | Soahanina |Sundsvall @@ -80,6 +81,12 @@ Serovar nomenclature update after USA Cantaloupe Outbreaks in November 2023. The |SAL_DA7014AA |Martonos | Finkenwerder |SRR1300569 |Martonos | Finkenwerder |SRR1973814 |Martonos | Finkenwerder +|17-2557 |Paratyphi B var. Java | Paratyphi B +|17-8544 |Paratyphi B var. Java | Paratyphi B +|17-9304 |Paratyphi B var. Java | Paratyphi B +|18-0116 |Paratyphi B var. Java | Paratyphi B +|17-7324 |Paratyphi B var. Java | Paratyphi B +|17-9312 |Paratyphi B var. 
Java | Paratyphi B ### Changes to `Salmonella-serotype_serogroup_antigen_table-WHO_2007.csv` antigen to serovar lookup database Removed the following entries diff --git a/README.rst b/README.rst index a9aa898..8ae3fc1 100644 --- a/README.rst +++ b/README.rst @@ -127,7 +127,7 @@ SISTR will automatically initialize database of *Salmonella* serovar determinati The SISTR database v1.3 got minor updates by collapsing some of the serovars with O24/O25 antigens detailed in `CHANGELOG.md `_ file - SISTR v1.1 database is available at https://zenodo.org/records/13618515 or via a direct url https://zenodo.org/records/13618515/files/SISTR_V_1.1_db.tar.gz?download=1 (used with SISTR < 1.1.3 ) -- SISTR v1.3 database is available at https://zenodo.org/records/13693495 or va a direct url https://zenodo.org/records/13693495/files/SISTR_V_1.1.3_db.tar.gz?download=1 (used with SISTR >= 1.1.3) +- SISTR v1.3 database is available at https://zenodo.org/records/14270992 or via a direct url https://zenodo.org/records/14270992/files/SISTR_V_1.1.3_db.tar.gz?download=1 (used with SISTR >= 1.1.3) Dependencies diff --git a/sistr/src/blast_wrapper/__init__.py b/sistr/src/blast_wrapper/__init__.py index 60a8260..ecb9e05 100644 --- a/sistr/src/blast_wrapper/__init__.py +++ b/sistr/src/blast_wrapper/__init__.py @@ -295,8 +295,9 @@ def top_result(self): if self.is_missing: return None + blast_candidate_antigens = set([i.split("|")[-1] for i in self.df['qseqid'].to_list()]) # df_perfect_matches = self.df[(self.df['coverage'] == 1.0) & (self.df['pident'] == 100.0)] - if df_perfect_matches.shape[0]: + if df_perfect_matches.shape[0] > 0 and len(blast_candidate_antigens) == 1: self.is_perfect_match = True return BlastReader.df_first_row_to_dict(df_perfect_matches) diff --git a/sistr/src/cgmlst/__init__.py b/sistr/src/cgmlst/__init__.py index c867ae9..4db378e 100644 --- a/sistr/src/cgmlst/__init__.py +++ b/sistr/src/cgmlst/__init__.py @@ -208,7 +208,7 @@ def matches_to_marker_results(df): if len(v) > 1: 
logging.debug('Multiple potential cgMLST allele matches (n=%s) found for marker %s. Selecting match on longest contig.', len(v), k) df_marker = pd.DataFrame(v) - df_marker.sort_values('slen', ascending=False, inplace=True) + df_marker.sort_values(['bitscore','length'], ascending=False, inplace=True) for i,r in df_marker.iterrows(): allele = r['allele_name'] slen = r['slen'] @@ -256,7 +256,7 @@ def find_closest_related_genome(marker_results, df_genome_profiles): df_relatives['matching'] = genome_profile_similarity_counts df_relatives['distance'] = 1.0 - (df_relatives['matching'] / float(n_markers)) df_relatives.index = df_genome_profiles.index - df_relatives.sort_values(by='distance', inplace=True) + df_relatives.sort_values(by=['distance','matching'], ascending=[True,False], inplace=True) return df_relatives diff --git a/sistr/src/serovar_prediction/__init__.py b/sistr/src/serovar_prediction/__init__.py index ba3f272..e0b75fd 100644 --- a/sistr/src/serovar_prediction/__init__.py +++ b/sistr/src/serovar_prediction/__init__.py @@ -133,8 +133,7 @@ def __init__(self, blast_runner): def search_for_wzx(self): self.wzx_prediction = self.get_antigen_gene_blast_results(self.wzx_prediction, WZX_FASTA_PATH) - #'blast_results', 'is_missing', 'is_perfect_match', 'is_trunc', 'serogroup', 'top_result' - #print( self.wzx_prediction.top_result); raise Exception() + if not self.wzx_prediction.is_missing and not self.wzx_prediction.top_result is None : top_result = self.wzx_prediction.top_result top_result_pident = top_result['pident'] diff --git a/sistr/src/serovar_prediction/constants.py b/sistr/src/serovar_prediction/constants.py index cf37cf0..2eb68f9 100644 --- a/sistr/src/serovar_prediction/constants.py +++ b/sistr/src/serovar_prediction/constants.py @@ -8,7 +8,7 @@ CGMLST_SUBSPECIATION_DISTANCE_THRESHOLD = 0.9 MASH_SUBSPECIATION_DISTANCE_THRESHOLD = 0.01 -SISTR_DB_URL = 'https://zenodo.org/records/13693495/files/SISTR_V_1.1.3_db.tar.gz?download=1' +SISTR_DB_URL = 
'https://zenodo.org/records/14270992/files/SISTR_V_1.1.3_db.tar.gz?download=1' SISTR_DATA_DIR = resource_filename('sistr','data') SEROVAR_TABLE_PATH = resource_filename('sistr', 'data/Salmonella-serotype_serogroup_antigen_table-WHO_2007.csv') WZX_FASTA_PATH = resource_filename('sistr', 'data/antigens/wzx.fasta')