From 742f5ec42fc38a54fb7be999842128ef5de259aa Mon Sep 17 00:00:00 2001 From: ttubb Date: Wed, 6 Mar 2024 19:04:29 +0000 Subject: [PATCH] re0.9.1 --- .../01_samples_reads_assembly_bins_mags.yaml | 2 +- examples/02_samples_reads_assembly_bins.yaml | 2 +- examples/03_samples_reads_assembly.yaml | 2 +- examples/04_reads_assembly_bins_mags.yaml | 2 +- examples/05_reads_assembly_bins.yaml | 2 +- examples/06_reads_assembly.yaml | 2 +- examples/07_assembly_bins_mags.yaml | 2 +- examples/08_assembly_bins.yaml | 2 +- examples/09_assembly.yaml | 2 +- examples/10_bins_mags.yaml | 2 +- examples/11_bins.yaml | 2 +- examples/12_mags.yaml | 2 +- setup.py | 2 +- synum/assemblySubmission.py | 18 +- synum/binSubmission.py | 422 ++---------------- synum/enaSearching.py | 184 ++++++-- synum/magSubmission.py | 23 +- synum/main.py | 30 +- synum/preflight.py | 170 +++++-- synum/readSubmission.py | 39 +- synum/sampleSubmission.py | 4 +- synum/statConf.py | 3 +- synum/taxQuery.py | 2 +- synum/utility.py | 98 +++- 24 files changed, 481 insertions(+), 538 deletions(-) diff --git a/examples/01_samples_reads_assembly_bins_mags.yaml b/examples/01_samples_reads_assembly_bins_mags.yaml index 7b43a4b..f396d70 100644 --- a/examples/01_samples_reads_assembly_bins_mags.yaml +++ b/examples/01_samples_reads_assembly_bins_mags.yaml @@ -50,7 +50,7 @@ PAIRED_END_READS: RELATED_SAMPLE_TITLE: "idx00_main digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: "idx00_e01_coassembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" + ASSEMBLY_NAME: "idx00_e01_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "SGMA project mg" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/02_samples_reads_assembly_bins.yaml b/examples/02_samples_reads_assembly_bins.yaml index 8a489f0..2384e0f 100644 --- a/examples/02_samples_reads_assembly_bins.yaml +++ b/examples/02_samples_reads_assembly_bins.yaml @@ -31,7 +31,7 @@ SINGLE_READS: RELATED_SAMPLE_TITLE: 'idx00_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: 'idx00_e02_assembly' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: 'idx00_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: 'metaSPAdes' # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: 'ant fungus garden' # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/03_samples_reads_assembly.yaml b/examples/03_samples_reads_assembly.yaml index 468dc33..b6d1900 100644 --- a/examples/03_samples_reads_assembly.yaml +++ b/examples/03_samples_reads_assembly.yaml @@ -68,7 +68,7 @@ PAIRED_END_READS: RELATED_SAMPLE_TITLE: "idx00_main digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: "idx00_e01_coassembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" + ASSEMBLY_NAME: "idx00_e03_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "SGMA project mg" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/04_reads_assembly_bins_mags.yaml b/examples/04_reads_assembly_bins_mags.yaml index 30a0ff1..fbbf2d9 100644 --- a/examples/04_reads_assembly_bins_mags.yaml +++ b/examples/04_reads_assembly_bins_mags.yaml @@ -21,7 +21,7 @@ SINGLE_READS: RELATED_SAMPLE_ACCESSION: 'SAMEA113417017' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: "idx00_e01_coassembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" + ASSEMBLY_NAME: "idx00_e04_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "SGMA project mg" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/05_reads_assembly_bins.yaml b/examples/05_reads_assembly_bins.yaml index a1f8486..1b244ff 100644 --- a/examples/05_reads_assembly_bins.yaml +++ b/examples/05_reads_assembly_bins.yaml @@ -32,7 +32,7 @@ PAIRED_END_READS: RELATED_SAMPLE_ACCESSION: 'SAMEA113417018' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: "idx00_e01_coassembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" + ASSEMBLY_NAME: "idx00_e05_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "SGMA project mg" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/06_reads_assembly.yaml b/examples/06_reads_assembly.yaml index 7180046..cc603c9 100644 --- a/examples/06_reads_assembly.yaml +++ b/examples/06_reads_assembly.yaml @@ -30,7 +30,7 @@ PAIRED_END_READS: RELATED_SAMPLE_ACCESSION: 'SAMEA113417017' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: 'idx00_e06_assembly' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: 'idx00_e06_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: 'metaSPAdes' # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: 'ant fungus garden' # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/07_assembly_bins_mags.yaml b/examples/07_assembly_bins_mags.yaml index a1dc271..6745d97 100644 --- a/examples/07_assembly_bins_mags.yaml +++ b/examples/07_assembly_bins_mags.yaml @@ -13,7 +13,7 @@ SEQUENCING_PLATFORMS: ["ILLUMINA"] # PROJECT_NAME: "idx00_AgRFex 2 Survey" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" SAMPLE_ACCESSIONS: ['SAMEA113417017'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex07_assembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex07_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/08_assembly_bins.yaml b/examples/08_assembly_bins.yaml index 41ce4d9..1554175 100644 --- a/examples/08_assembly_bins.yaml +++ b/examples/08_assembly_bins.yaml @@ -11,7 +11,7 @@ METAGENOME_TAXID: "718289" # SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] SAMPLE_ACCESSIONS: ['SAMEA113417017'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex08_assembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex08_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/09_assembly.yaml b/examples/09_assembly.yaml index f382512..ef9d036 100644 --- a/examples/09_assembly.yaml +++ b/examples/09_assembly.yaml @@ -10,7 +10,7 @@ METAGENOME_TAXID: "718289" # SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] SAMPLE_ACCESSIONS: ['SAMEA113417017','SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex09_assembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex09_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/examples/10_bins_mags.yaml b/examples/10_bins_mags.yaml index d1963c7..cd8a551 100644 --- a/examples/10_bins_mags.yaml +++ b/examples/10_bins_mags.yaml @@ -11,7 +11,7 @@ SEQUENCING_PLATFORMS: ["ILLUMINA"] # PROJECT_NAME: "Project ex10 idx00" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" SAMPLE_ACCESSIONS: ['SAMEA113417017','SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex10_assembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex10_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" EXISTING_ASSEMBLY_ANALYSIS_ACCESSION: # The accession of the assembly analysis that all bins/MAGs originate from >>EXAMPLE: "GCA_012552665" EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION: 'SAMEA114745644' # The accession of the virtual sample of the co-assembly which all bins/MAGs originate from >>EXAMPLE: "ERZ21942150" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" diff --git a/examples/11_bins.yaml b/examples/11_bins.yaml index 2c2db0f..56f3c3c 100644 --- a/examples/11_bins.yaml +++ b/examples/11_bins.yaml @@ -11,7 +11,7 @@ SEQUENCING_PLATFORMS: ["ILLUMINA"] # PROJECT_NAME: "Project ex11 idx00" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" SAMPLE_ACCESSIONS: ["SAMEA113417017"] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex11_assembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex11_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" EXISTING_ASSEMBLY_ANALYSIS_ACCESSION: "ERZ21942150" # The accession of the assembly analysis that all bins/MAGs originate from >>EXAMPLE: "GCA_012552665" EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION: # The accession of the virtual sample of the co-assembly which all bins/MAGs originate from >>EXAMPLE: "ERZ21942150" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" diff --git a/examples/12_mags.yaml b/examples/12_mags.yaml index 88d1490..e6f2235 100644 --- a/examples/12_mags.yaml +++ b/examples/12_mags.yaml @@ -12,7 +12,7 @@ SEQUENCING_PLATFORMS: ["ILLUMINA"] # PROJECT_NAME: "idx00_ex12_proj" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" SAMPLE_ACCESSIONS: ['SAMEA113417017','SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex12_assembly" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex12_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" EXISTING_ASSEMBLY_ANALYSIS_ACCESSION: # The accession of the assembly analysis that all bins/MAGs originate from >>EXAMPLE: "GCA_012552665" EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION: 'SAMEA114745644' # The accession of the virtual sample of the co-assembly which all bins/MAGs originate from >>EXAMPLE: "ERZ21942150" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" diff --git a/setup.py b/setup.py index 72a7471..12aacc6 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ def run(self): setup( name='synum', - version='0.9.0', + version='0.9.1', packages=find_packages(), install_requires=[ 'pysam>=0.19.1', diff --git a/synum/assemblySubmission.py b/synum/assemblySubmission.py index 97876cd..3c974ee 100644 --- a/synum/assemblySubmission.py +++ b/synum/assemblySubmission.py @@ -7,7 +7,7 @@ from requests.auth import HTTPBasicAuth from synum import utility, loggingC -from synum.utility import from_config +from synum.utility import from_config, stamped_from_config from synum.statConf import staticConfig from synum.webinWrapper import webin_cli @@ -20,21 +20,21 @@ def __prep_coassembly_samplesheet(config: dict, Args: config (dict): The configuration dictionary. outdir (str): The directory where the samplesheet will be written. - staticConfig (staticConfig, optional): The static configuration object. Defaults to staticConfig. + origin_samples: list of sample accessions that are being co-assembled Returns: str: The path to the samplesheet. 
""" loggingC.message(f">Preparing assembly samplesheet...", threshold=0) - sample_alias = from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') + sample_alias = stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') # Create XML tree root = ET.Element("SAMPLE_SET") sample = ET.SubElement(root, "SAMPLE", alias=sample_alias) title = ET.SubElement(sample, "TITLE") - title.text = from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME') + title.text = stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME') sample_name = ET.SubElement(sample, "SAMPLE_NAME") taxon_id = ET.SubElement(sample_name, "TAXON_ID") @@ -117,6 +117,7 @@ def __submit_coassembly_samplesheet(sample_xml: str, def __prep_assembly_manifest(config: dict, + logging_dir: str, outdir: str, depth_files, run_accessions, @@ -143,8 +144,10 @@ def __prep_assembly_manifest(config: dict, if depth_files is None: COVERAGE = utility.from_config(config, 'ASSEMBLY', 'COVERAGE_VALUE') else: + coverage_outfile = os.path.join(logging_dir, "assembly_coverage.txt") COVERAGE = utility.calculate_coverage(depth_files, - threads=threads) + outfile=coverage_outfile, + threads=threads) # Write manifest PLATFORM = utility.from_config(config, 'SEQUENCING_PLATFORMS') @@ -157,7 +160,7 @@ def __prep_assembly_manifest(config: dict, rows = [ [ 'STUDY', utility.from_config(config, 'STUDY') ], [ 'SAMPLE', sample_accession ], - [ 'ASSEMBLYNAME', utility.from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') ], + [ 'ASSEMBLYNAME', utility.stamped_from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') ], [ 'ASSEMBLY_TYPE', staticConfig.sequence_assembly_type ], [ 'COVERAGE', COVERAGE ], [ 'PROGRAM', utility.from_config(config, 'ASSEMBLY','ASSEMBLY_SOFTWARE') ], @@ -268,6 +271,7 @@ def submit_assembly(config: dict, os.makedirs(fasta_logging_dir, exist_ok=False) ## make a manifest and submit manifest_path = __prep_assembly_manifest(config, + logging_dir, fasta_submission_dir, depth_files, run_accessions, @@ -276,7 +280,7 @@ def 
submit_assembly(config: dict, threads=threads) loggingC.message(f">Using ENA Webin-CLI to submit assembly.", threshold=0) - assembly_name = utility.from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') usr, pwd = utility.get_login() receipt, accession = webin_cli(manifest=manifest_path, inputdir=fasta_submission_dir, diff --git a/synum/binSubmission.py b/synum/binSubmission.py index 85bc226..351efa8 100644 --- a/synum/binSubmission.py +++ b/synum/binSubmission.py @@ -11,125 +11,6 @@ from synum.webinWrapper import webin_cli from synum.statConf import staticConfig -# def __report_tax_issues(issues): -# """ -# Reports the issues encountered when trying to determine the taxonomy -# of metagenome bins automatically. - -# Args: -# issues (list): A list of dictionaries with the issues encountered. -# """ -# err = f"\nERROR: Unable to determine taxonomy for {len(issues)} bins:" -# loggingC.message(err, threshold=-1) -# problematic_bins = [x['mag_bin'] for x in issues] -# problematic_bins = set(problematic_bins) -# msg = '\n'.join(problematic_bins) -# loggingC.message(msg, threshold=0) -# msg = f"Please manually enter taxonomy data for these bins into a .tsv file and specify it in the MANUAL_TAXONOMY_FILE field in the config file." 
-# loggingC.message(msg, threshold=0) -# msg = f"\nTaxonomy issues are:" -# loggingC.message(msg, threshold=0) -# for i in issues: -# mag = i['mag_bin'] -# level = i['level'] -# classification = i['classification'] -# suggestions = i['suggestions'] -# if len(i['suggestions']) == 0: -# loggingC.message(f"MAG {mag} - no suggestions found (classified as {level} {classification})", threshold=0) -# else: -# loggingC.message(f"MAG {mag} - multiple suggestions found (classified as {level} {classification})", threshold=0) -# loggingC.message("\tSuggestions are:", threshold=0) -# for s in suggestions: -# loggingC.message(f"\t{s['taxId']}\t{s['scientificName']}", threshold=0) -# exit(1) - - -# def query_ena_taxonomy(level: str, -# domain: str, -# classification: str, -# filtered: bool = True) -> list: -# """ -# Based on the query string, use the ENA REST API to get a suggestion for the -# taxonomy. - -# Args: -# level (str): The taxonomic level of the query string. -# domain (str): The domain of the query string. -# classification (str): The query string. -# filtered (bool, optional): If True, only return the best suggestion. -# Defaults to True. - -# Returns: -# list: A list of dictionaries with the suggestions from the ENA REST API. -# """ -# # If we know the species we just query for that. -# if level == 'species': -# query = classification -# # If we know the genus get a " sp." name -# elif level == 'genus': -# query = f"{classification} sp." -# # If we only know the domain, we get " domain" -# elif level == 'domain': -# if domain == 'Archaea': -# query = 'uncultured archaeon' -# elif domain == 'Bacteria': -# query = 'uncultured bacterium' -# else: -# err = f"\nERROR: Encountered unknown domain {domain} for a mag bin. Please check your NCBI taxonomy files." 
-# loggingC.message(err, threshold=-1) -# exit(1) -# # If we know only something between domain and genus then we get -# # " bacterium" / " archeon" -# else: -# if domain == 'Archaea': -# dstring = 'archaeon' -# elif domain == 'Bacteria': -# dstring = 'bacterium' -# elif domain == 'metagenome': -# dstring = 'metagenome' -# else: -# err = f"\nERROR: Encountered unknown domain {domain}" -# loggingC.message(err, threshold=-1) -# exit(1) -# if domain == 'metagenome': -# query = classification -# else: -# query = f"{classification} {dstring}" - -# url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/suggest-for-submission/{query}" -# response = requests.get(url) -# if response.status_code == 200: -# suggestions = response.json() -# result = [] -# for suggestion in suggestions: -# tax_id = suggestion.get("taxId", "N/A") -# scientific_name = suggestion.get("scientificName", "N/A") -# display_name = suggestion.get("displayName", "N/A") -# taxdata = {"tax_id": tax_id, "scientificName": scientific_name, "displayName": display_name} -# # If we don't filter we want all results -# if not filtered: -# result.append(taxdata) -# # For genus level we want the "is species of this genus" result -# elif level == 'genus': -# if scientific_name.endswith('sp.'): -# result.append(taxdata) -# # For species, we want the result that is the species name (no subspecies) -# # It might have something like "Candidatus" in front so we don't exclude that -# elif level == 'species': -# if scientific_name.endswith(classification): -# result.append(taxdata) -# # For all other cases we want the " " taxon. 
-# else: -# if scientific_name.endswith('archaeon') or scientific_name.endswith('bacterium') or level == 'metagenome': -# result.append(taxdata) -# return result -# else: -# err = f"\nERROR: Trying to fetch taxonomy suggestion for {level}: {classification} (domain: {domain}) but ENA REST API returned status code {response.status_code}" -# loggingC.message(err, threshold=-1) -# loggingC.message(f"Attempted query was {url}", threshold=0) -# exit(1) - - def __calculate_bin_coverage(fasta: str, depth_files: list, @@ -161,277 +42,6 @@ def __calculate_bin_coverage(fasta: str, return coverage -# def __read_ncbi_taxonomy(ncbi_taxonomy_file: str) -> dict: -# """ -# Read the output of GTDB-TKs 'gtdb_to_ncbi_majority_vote.py' or a file using -# the format described in README.md and return a dictionary with a taxid and a -# scientific name for each genome. -# Important note: The output of GTDB-TKs 'gtdb_to_ncbi_majority_vote.py' script -# might miss some genome bins which didn't get GTDB classifications. - -# Args: -# ncbi_to_taxonomy_file (str): Output files of GTDB-TKs -# 'gtdb_to_ncbi_majority_vote.py' script or NCBI taxonomy file with a -# the columns 'Bin_id' and 'NCBI_taxonomy'. - -# Returns: -# dict: A dictionary with a taxid and a scientific name for each genome. 
-# """ -# result = {} -# with open(ncbi_taxonomy_file, 'r') as f: -# reader = csv.reader(f, delimiter='\t') -# header = next(reader) #skip header -# if header == staticConfig.gtdb_majority_vote_columns.split(';'): -# gtdb_majvote_output = True -# else: -# gtdb_majvote_output = False -# if gtdb_majvote_output: -# with open(ncbi_taxonomy_file, 'r') as f: -# reader = csv.reader(f, delimiter='\t') -# next(reader) #skip header -# for row in reader: -# mag_bin = row[0].strip() -# lowest_ncbi_name = row[2].strip().split(';') -# result[mag_bin] = lowest_ncbi_name -# else: -# with open(ncbi_taxonomy_file, 'r') as f: -# reader = csv.DictReader(f, delimiter='\t') -# for row in reader: -# mag_bin = row["Bin_id"].strip() -# taxonomy_string = row["NCBI_taxonomy"].strip() -# lowest_ncbi_name = taxonomy_string.split(';') -# result[mag_bin] = lowest_ncbi_name -# return result - - -# def __best_classifications(ncbi_classifications: dict) -> dict: -# """ -# For each bin, find the best classification string. This is the classification -# string on the lowest level that is not empty. - -# Args: -# ncbi_classifications (dict): A dictionary with a list of classification strings -# for each bin. - -# Returns: -# dict: A dictionary with the best classification string for each bin. -# """ -# result = {} -# levels = staticConfig.taxonomic_levels.split(';') -# for mag_bin, clist in ncbi_classifications.items(): -# # Check if we have an "unclassified" bin here -# if len(clist) == 1: -# clasf = clist[0] -# if clasf == 'Unclassified Bacteria': -# result[mag_bin] = { -# 'level': 'domain', -# 'classification': 'Bacteria', -# 'domain': 'Bacteria', -# } -# elif clasf == 'Unclassified Archaea': -# result[mag_bin] = { -# 'level': 'domain', -# 'classification': 'Archaea', -# 'domain': 'Archaea', -# } -# else: -# err = f"\nERROR: Found unclassified bin {mag_bin} with unknown classification {clasf}. Please check your NCBI taxonomy files." 
-# loggingC.message(err, threshold=-1) -# exit(1) -# loggingC.message(f">INFO: Bin {mag_bin} is unclassified.", threshold=1) -# else: -# # Iterate through classification strings until we find a valid one -# for cstring, level in zip(reversed(clist), levels): -# if len(cstring) < 4: # no classification on this level -# continue -# break -# # If this is unclassified we need to include some padding -# if cstring.startswith('Unclassified'): -# cstring = '___'+cstring -# clist[0] = '___'+clist[0] -# result[mag_bin] = { -# 'level': level, -# 'classification': cstring[3:], -# 'domain': clist[0][3:] -# } -# return result - - -# def __taxonomic_classification(ncbi_taxonomy_files: list) -> dict: -# """ -# Read the output of GTDB-TKs 'gtdb_to_ncbi_majority_vote.py' script or -# a similarly structured file and return a dictionary with a taxid and a -# scientific name for each genome. - -# Args: -# ncbi_to_taxonomy_file (str): Output files of GTDB-TKs -# 'gtdb_to_ncbi_majority_vote.py' script or NCBI taxonomy file with a -# similar structure. - -# Returns: -# dict: A dictionary with a taxid and a scientific name for each genome. -# """ -# all_classifications = {} -# for f in ncbi_taxonomy_files: -# ncbi_classifications = __read_ncbi_taxonomy(f) -# best_ncbi_classifications = __best_classifications(ncbi_classifications) -# all_classifications.update(best_ncbi_classifications) -# return all_classifications - - -# def __read_manual_taxonomy_file(manual_taxonomy_file: str) -> dict: -# """ -# Read a manual taxonomy file and return a dictionary with the taxid and -# scientific name for each bin. - -# Args: -# manual_taxonomy_file (str): Path to the manual taxonomy file. - -# Returns: -# dict: A dictionary with the taxid and scientific name for each bin. 
-# """ -# result = {} -# with open(manual_taxonomy_file, 'r') as f: -# reader = csv.DictReader(f, delimiter='\t') -# for row in reader: -# mag_bin = row['Bin_id'].strip() -# tax_id = row['Tax_id'].strip() -# scientific_name = row["Scientific_name"].strip() -# result[mag_bin] = { -# 'tax_id': tax_id, -# 'scientific_name': scientific_name, -# } -# return result - -# def get_bin_taxonomy(config) -> dict: -# """ -# Based on the NCBI taxonomy files and the manual taxonomy file, create a -# dictionary with the taxid and scientific name for each bin. - -# Args: -# config (dict): The config dictionary. - -# Returns: -# dict: A dictionary with the taxid and scientific name for each bin. -# """ -# loggingC.message(">Reading in bin taxonomy data", threshold=0) -# # Extract data from config -# ncbi_taxonomy_files = utility.optional_from_config(config, 'BINS', 'NCBI_TAXONOMY_FILES') -# if ncbi_taxonomy_files == '': # Key not found in config -# ncbi_taxonomy_files = [] -# loggingC.message(">No NCBI taxonomy files found in config.", threshold=0) -# if not type(ncbi_taxonomy_files) == list: -# ncbi_taxonomy_files = [ncbi_taxonomy_files] -# bins_directory = utility.from_config(config, 'BINS', 'BINS_DIRECTORY') - -# # Make a list of all files in the bins directory -# files = os.listdir(bins_directory) -# bin_basenames = [] -# for f in files: -# basename = utility.is_fasta(os.path.join(bins_directory, f)) -# if basename: -# bin_basenames.append(basename) - -# # Read the taxnomy data from the MANUAL_TAXONOMY_FILE if it exists -# upload_taxonomy_data = {} -# try: -# manual_taxonomy_file = utility.from_config(config, -# 'BINS', -# 'MANUAL_TAXONOMY_FILE', -# supress_errors=True) -# loggingC.message(f">Reading manual taxonomy file {os.path.basename(manual_taxonomy_file)}", threshold=0) -# except: -# manual_taxonomy_file = 'manual_taxonomy_file_doesnt_exist' -# loggingC.message(">No manual taxonomy file found.", threshold=0) -# if os.path.exists(manual_taxonomy_file): -# if not 
manual_taxonomy_file == 'manual_taxonomy_file_doesnt_exist': -# upload_taxonomy_data = __read_manual_taxonomy_file(manual_taxonomy_file) - -# # Make a dictionary for the taxonomies based on the taxonomy files -# annotated_bin_taxonomies = __taxonomic_classification(ncbi_taxonomy_files) - - -# # Make sure that for each bin showing up in the taxonomy files we have a -# # corresponding fasta file in bin_files -# from_taxonomies = set(upload_taxonomy_data.keys()) -# from_taxonomies.update(set(annotated_bin_taxonomies.keys())) -# bin_basenames_set = set(bin_basenames) -# only_bin_basenames = bin_basenames_set.difference(from_taxonomies) -# only_taxonomies = from_taxonomies.difference(bin_basenames_set) -# if len(only_bin_basenames) > 0: -# err = f"\nERROR: The following bins were found in the bins directory but not in the taxonomy files:" -# loggingC.message(err, threshold=-1) -# for b in only_bin_basenames: -# msg = f"\t{b}" -# loggingC.message(msg, threshold=-0) -# exit(1) -# if len(only_taxonomies) > 0: -# err = f"\nERROR: The following bins were found in the taxonomy files but not in the bins directory:" -# loggingC.message(err, threshold=-1) -# for b in only_taxonomies: -# msg = f"\t{b}" -# loggingC.message(msg, threshold=-1) -# exit(1) -# only_fasta = set(bin_basenames) - from_taxonomies -# if len(only_fasta) > 0: -# err = f"\nERROR: Tt in the taxonomy files:" -# for b in only_fasta: -# msg = f"\t{b}" -# loggingC.message(msg, threshold=-1) -# exit(1) - -# # Query the ENA API for taxids and scientific names for each bin -# loggingC.message(">Querying ENA for taxids and scientific names for each bin.", threshold=0) - -# issues = [] -# for bin_name, taxonomy in tqdm(annotated_bin_taxonomies.items(), leave=False): -# if bin_name in upload_taxonomy_data: -# loggingC.message(f">INFO: Bin {bin_name} was found in the manual taxonomy file and will be skipped.", threshold=1) -# continue -# suggestions = query_ena_taxonomy(taxonomy['level'], -# taxonomy['domain'], -# 
taxonomy['classification'], -# filtered=True) -# if len(suggestions) == 1: -# upload_taxonomy_data[bin_name] = { -# 'scientific_name': suggestions[0]['scientificName'], -# 'tax_id': suggestions[0]['tax_id'], -# } -# else: -# all_ena_suggestions = query_ena_taxonomy(taxonomy['level'], -# taxonomy['domain'], -# taxonomy['classification'], -# filtered=False) - -# issues.append({ -# 'mag_bin': bin_name, -# 'level': taxonomy['level'], -# 'classification': taxonomy['classification'], -# 'suggestions': all_ena_suggestions, -# }) - -# # Add any bins that only show up in the files to the issues as unclassified -# for basename in bin_basenames: -# if not basename in upload_taxonomy_data: -# is_issue = False -# for i in issues: -# if i['mag_bin'] == basename: -# is_issue = True -# if is_issue: -# continue -# issues.append({ -# 'mag_bin': basename, -# 'level': 'unclassified', -# 'classification': 'unclassified', -# 'suggestions': [], -# }) - -# if len(issues) > 0: -# __report_tax_issues(issues) -# exit(1) - -# return upload_taxonomy_data - def get_bin_quality(config, silent=False) -> dict: """ Based on CheckM output (or any other tsv with the columns 'Bin Id', @@ -542,7 +152,7 @@ def __prep_bins_samplesheet(config: dict, for bin_id in upload_taxonomy_data.keys(): - assembly_name = utility.from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') sample_alias = f"{assembly_name}_bin_{bin_id}_virtual_sample" sample_title = f"{assembly_name}_bin_{bin_id}_virtual_sample" @@ -738,7 +348,7 @@ def __prep_bin_manifest(config: dict, if isinstance(run_accessions, list): run_accessions = ",".join(run_accessions) - assembly_name = utility.from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME') assembly_name = assembly_name.replace(' ','_') bin_assembly_name = f"{assembly_name}_bin_{bin_sample_accession}" 
if len(bin_assembly_name) > staticConfig.max_assembly_name_length: @@ -812,8 +422,9 @@ def __stage_bin_submission(staging_directory: str, def bin_coverage_from_depth(depth_files: str, - bin_name_to_fasta: dict, - threads: int = 4) -> dict: + bin_name_to_fasta: dict, + outfile: str=None, + threads: int = 4) -> dict: """ Calculate coverage for each bin from depth files. @@ -825,13 +436,28 @@ def bin_coverage_from_depth(depth_files: str, Returns: dict: Dictionary mapping bin names to coverage values. """ - loggingC.message(">Calculating coverage for each bin from depth files.", threshold=0) + msg = ">Calculating coverage for each bin from depth files." + loggingC.message(msg, threshold=0) + msg = f">A coverage file will be written to {outfile}\n You can use it to " \ + " provide a KNOWN_COVERAGE_FILE instead of BAM_FILES in the config " \ + " if you need to run the submission process again." + if outfile: + loggingC.message(msg, threshold=0) + bin_coverages = {} for bin_name, bin_fasta in tqdm(bin_name_to_fasta.items(), leave=False): coverage = __calculate_bin_coverage(bin_fasta, depth_files, threads=threads) bin_coverages[bin_name] = coverage + + if outfile: + with open(outfile, 'w') as f: + writer = csv.writer(f, delimiter='\t') + writer.writerow(['Bin_id', 'Coverage']) + for bin_name, coverage in bin_coverages.items(): + writer.writerow([bin_name, coverage]) + return bin_coverages @@ -941,9 +567,11 @@ def submit_bins(config: dict, # Get the coverage for each bin file loggingC.message(">Deriving bin coverage", threshold=1) + coverage_outfile = os.path.join(logging_dir, 'bin_coverages.tsv') if not depth_files is None: bin_coverages = bin_coverage_from_depth(depth_files, bin_name_to_fasta, + coverage_outfile, threads=threads) elif not bin_coverage_file is None: bin_coverages = bin_coverage_from_tsv(bin_coverage_file, @@ -968,7 +596,7 @@ def submit_bins(config: dict, samples_logging_dir, url) # Remove the prefixes - assembly_name = utility.from_config(config, 
'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') prefix_len = len(f"{assembly_name}_bin_") suffix_len= len(f"_virtual_sample") bin_to_accession = {} @@ -1003,7 +631,7 @@ def submit_bins(config: dict, bin_logging_dir = os.path.join(logging_dir, f"{bin_name}") os.makedirs(bin_logging_dir, exist_ok=False) bin_manifest = bin_manifests[bin_name] - assembly_name = utility.from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') subdir_name = assembly_name + '_' + bin_name bin_receipts[bin_name], bin_accessions[bin_name] = webin_cli(manifest=bin_manifest, diff --git a/synum/enaSearching.py b/synum/enaSearching.py index f8e2d62..3025bd2 100644 --- a/synum/enaSearching.py +++ b/synum/enaSearching.py @@ -5,23 +5,21 @@ def study_exists(study_accession: str, - testmode: bool) -> bool: + devserver: bool) -> bool: """ Check if a study with the input accession exists in ENA. Args: study_accession (str): The study accession. - testmode (bool): Whether to use the test server. + devserver (bool): Whether to use the test server. Returns: bool: True if the study exists, False if not. """ - if testmode: - #url = "https://wwwdev.ebi.ac.uk/ena/portal/api/search" - url = staticConfig.ena_search_url - else: - #url = "https://www.ebi.ac.uk/ena/portal/api/search" + if devserver: url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url params = { "query": f"study_accession={study_accession}", "result": "study", @@ -38,29 +36,22 @@ def study_exists(study_accession: str, return False -def sample_exists(sample_accession: str, - testmode: bool) -> bool: +def sample_accession_exists(sample_accession: str, + devserver: bool = False) -> bool: """ Check if a sample with the input accession exists in ENA. Args: sample_accession (str): The sample accession. - testmode (bool): Whether to use the test server. 
+ devserver (bool): Whether to use the test server. Returns: bool: True if the sample exists, False if not. """ - - #msg = f"\WARNING: Checking sample accessions is suspended because of issues with the ENA API." - #loggingC.message(msg, threshold=0) - #return True - - if testmode: - #url = "https://wwwdev.ebi.ac.uk/ena/portal/api/search" - url = staticConfig.ena_search_url - else: - #url = "https://www.ebi.ac.uk/ena/portal/api/search" + if devserver: url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url params = { "query": f"sample_accession={sample_accession}", "result": "sample", @@ -76,8 +67,102 @@ def sample_exists(sample_accession: str, return False +def sample_alias_accession(sample_alias: str, + study_accession: str, + devserver: bool) -> bool: + """ + Check if a sample with the input alias exists in ENA. Return the accession + if it does or None if it does not. + + Args: + sample_alias (str): The sample alias. + study_accession (str): The study accession. + devserver (bool): Whether to use the test server. + """ + if devserver: + url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url + params = { + "query": f"sample_alias={sample_alias} AND study_accession={study_accession}", + "result": "sample", + "fields": "sample_accession" + } + response = requests.get(url, params=params) + try: + data = response.text.split('\n')[1] + except: + data = None + if data == '': + data = None + return data + + +def sample_title_accession(sample_title: str, + study_accession: str, + devserver: bool) -> bool: + """ + Check if a sample with the input title exists in ENA. Return the accession + if it does or None if it does not. + + Args: + sample_title (str): The sample title. + study_accession (str): The study accession. + devserver (bool): Whether to use the test server. 
+ """ + if devserver: + url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url + params = { + "query": f"sample_title={sample_title} AND study_accession={study_accession}", + "result": "sample", + "fields": "sample_accession" + } + response = requests.get(url, params=params) + try: + data = response.text.split('\n')[1] + except: + data = None + if data == '': + data = None + return data + + + +def run_alias_accession(run_alias: str, + study_accession: str, + devserver: bool) -> bool: + """ + Check if a run with the input name exists in ENA. Return the accession + if it does or None if it does not. + + Args: + run_alias (str): The run name. + study_accession (str): The study accession. + devserver (bool): Whether to use the test server. + """ + if devserver: + url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url + params = { + "query": f"run_alias={run_alias} AND study_accession={study_accession}", + "result": "read_run", + "fields": "run_accession" + } + response = requests.get(url, params=params) + try: + data = response.text.split('\n')[1] + except: + data = None + if data == '': + data = None + return data + + def search_samples_by_assembly_analysis(assembly_analysis_accession: str, - testmode: bool) -> list: + devserver: bool) -> list: """ Get a list of sample accessions for a given assembly analysis accession. @@ -87,21 +172,24 @@ def search_samples_by_assembly_analysis(assembly_analysis_accession: str, Returns: str: A single sample accession. 
""" - if testmode: - #url = "https://wwwdev.ebi.ac.uk/ena/portal/api/search" - url = staticConfig.ena_search_url - else: - #url = "https://www.ebi.ac.uk/ena/portal/api/search" + if devserver: url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url params = { "query": f"analysis_accession={assembly_analysis_accession}", "result": "analysis", "fields": "sample_accession" } response = requests.get(url, params=params) + print("INPUT: ", assembly_analysis_accession) + print("RESPONSE: ", response.text) - sample_accession = response.text.split('\n')[1:-1][0] - sample_accession = sample_accession.split('\t')[0] + try: + sample_accession = response.text.split('\n')[1:-1][0] + sample_accession = sample_accession.split('\t')[0] + except: + return None if ',' in sample_accession: loggingC.message(f"\nERROR: Multiple sample accessions found for assembly analysis {assembly_analysis_accession}:\n{sample_accession}", threshold=-1) @@ -111,7 +199,7 @@ def search_samples_by_assembly_analysis(assembly_analysis_accession: str, def search_scientific_name_by_sample(sample_accession: str, - testmode: bool) -> str: + devserver: bool) -> str: """ Get the scientific name for a given sample accession. @@ -121,19 +209,16 @@ def search_scientific_name_by_sample(sample_accession: str, Returns: str: The scientific name of the sample. 
""" - if testmode: - #url = "https://wwwdev.ebi.ac.uk/ena/portal/api/search" - url = staticConfig.ena_search_url - else: - #url = "https://www.ebi.ac.uk/ena/portal/api/search" + if devserver: url = staticConfig.ena_test_search_url + else: + url = staticConfig.ena_search_url params = { "query": f"sample_accession={sample_accession}", "result": "sample", "fields": "scientific_name" } response = requests.get(url, params=params) - try: scientific_name = response.text.split('\n')[1:-1][0] scientific_name = scientific_name.split('\t')[0] @@ -148,11 +233,24 @@ def search_scientific_name_by_sample(sample_accession: str, return scientific_name -# For debugging -#print(sample_exists('SAMEA113417025', True)) -#print(sample_exists('ERS28162653', True)) -#print(study_exists("PRJEB71644", True)) -#print(search_scientific_name_by_sample('ERS28140038', True)) -#print(search_scientific_name_by_sample("SAMEA114749859", True)) -#print(search_samples_by_assembly_analysis('ERZ1049590')) -#print(search_runs_by_sample('SAMEA113417025')) \ No newline at end of file +if __name__ == "__main__": + # For debugging + print("DEBUG: Running sample_accession_exists('SAMEA113417025',False)") + print(sample_accession_exists('SAMEA113417025',False)) + print("DEBUG: Running sample_accession_exists('ERS28162653', False)") + print(sample_accession_exists('ERS28162653', False)) + print("DEBUG: Running study_exists('PRJEB71644', False)") + print(study_exists("PRJEB71644", False)) + print("DEBUG: Running search_scientific_name_by_sample('SAMEA114749859', False)") + print(search_scientific_name_by_sample("SAMEA114749859", False)) + print("DEBUG: Running search_samples_by_assembly_analysis('ERZ1049590', False)") + print(search_samples_by_assembly_analysis('ERZ1049590', False)) + print("DEBUG: Running sample_alias_accession('bgp35_d1a', 'PRJEB39821', False)") + print(sample_alias_accession('bgp35_d1a', 'PRJEB39821', False)) + print("DEBUG: Running sample_title_accession('bgp35_digester_1_a', 'PRJEB39821', 
False)") + print(sample_title_accession('bgp35_digester_1_a', 'PRJEB39821', False)) + print("DEBUG: Running read_name_accession('BGP350_Hc_deepseq', 'PRJEB39821', False)") + print(run_alias_accession('BGP350_Hc_deepseq', 'PRJEB39821', False)) + + print("UUUUUUUUUND") + print(search_scientific_name_by_sample('SAMEA114745644', False)) \ No newline at end of file diff --git a/synum/magSubmission.py b/synum/magSubmission.py index 88c60a0..af87277 100644 --- a/synum/magSubmission.py +++ b/synum/magSubmission.py @@ -91,7 +91,7 @@ def __prep_mags_samplesheet(config: dict, #completeness_software = utility.from_config(config, 'BINS', 'COMPLETENESS_SOFTWARE') binning_software = utility.from_config(config, 'BINS', 'BINNING_SOFTWARE') binning_parameters = utility.from_config(config, 'BINS', 'ADDITIONAL_SAMPLESHEET_FIELDS', 'binning parameters') - project_name = utility.from_config(config, 'PROJECT_NAME') + project_name = utility.stamped_from_config(config, 'PROJECT_NAME') taxonomic_identity_marker = utility.from_config(config, 'BINS', 'ADDITIONAL_SAMPLESHEET_FIELDS', @@ -104,7 +104,7 @@ def __prep_mags_samplesheet(config: dict, env_context_broad = utility.from_config(config, 'ASSEMBLY', 'ADDITIONAL_SAMPLESHEET_FIELDS','broad-scale environmental context') env_context_local = utility.from_config(config, 'ASSEMBLY', 'ADDITIONAL_SAMPLESHEET_FIELDS','local environmental context') env_medium = utility.from_config(config, 'ASSEMBLY', 'ADDITIONAL_SAMPLESHEET_FIELDS','environmental medium') - assembly_name = utility.from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') derived_from = ",".join([x['accession'] for x in sample_accession_data]) @@ -246,9 +246,22 @@ def __stage_mag_submission(metadata, coverage: float, run_accessions: list) -> str: """ + Stages all files needed for MAG submission and writes a MANIFEST file. + + Args: + metadata (dict): The metadata for the MAG. 
+ staging_directory (str): The directory where the files will be staged. + mag_id (str): The id of the MAG. + config (dict): The config dictionary. + mag_sample_accession (str): The sample accession for the MAG. + coverage (float): The coverage of the MAG. + run_accessions (list): A list of the run accessions. + + Returns: + str: The path to the MANIFEST file. """ # Prepare some data - assembly_name = utility.from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME') assembly_name = assembly_name.replace(' ', '_') mag_assembly_name = f"{assembly_name}_{mag_sample_accession}" @@ -416,7 +429,7 @@ def submit_mags(config: dict, # Remove the prefiexes - assembly_name = utility.from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') prefix_len = len(f"{assembly_name}_MAG_") suffix_len = len("_virtual_sample") mag_to_accession = {} @@ -454,7 +467,7 @@ def submit_mags(config: dict, mag_logging_dir = os.path.join(logging_dir, f"{mag_id}") os.makedirs(mag_logging_dir, exist_ok=False) mag_manifest = mag_manifests[mag_id] - assembly_name = utility.from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') + assembly_name = utility.stamped_from_config(config, 'ASSEMBLY','ASSEMBLY_NAME') subdir_name = assembly_name + '_' + mag_id mag_receipts[mag_id], mag_accessions[mag_id] = webinWrapper.webin_cli(manifest=mag_manifest, inputdir=mag_staging_dir, diff --git a/synum/main.py b/synum/main.py index 4215f53..f1986e4 100644 --- a/synum/main.py +++ b/synum/main.py @@ -58,8 +58,8 @@ def main(): type=int, choices=[0, 1], default=1, - help="Make submissions to the ENA dev test " - "server. [default 1/true]") + help="Make submissions to the ENA development " + "test server. 
[default 1/true]") parser_submit.add_argument("-t", "--threads", type=int, @@ -105,6 +105,15 @@ def main(): action="store_true", default=False, help="Skip preflight checks. Use with caution.") + parser_submit.add_argument("-z", "--timestamps", + type=int, + choices=[0, 1], + help="Add timestamps to the names of submitted " + "items. This prevents name clashes when " + "submitting the same data multiple times during " + "testing. Defaults to true when using the " + "development_service, defaults to false " + "otherwise.") parser_makecfg = subparsers.add_parser('makecfg', help='Create a .yml file ' @@ -195,16 +204,18 @@ def main(): elif args.mode == 'submit': loggingC.set_up_logging(args.logging_dir, args.verbosity) + if args.timestamps or (args.timestamps is None and args.development_service): + utility.set_up_timestamps(vars(args)) try: sver = staticConfig.synum_version wver = staticConfig.webin_cli_version loggingC.message(f">Running synum {sver} with webin-cli {wver}", 0) if args.development_service == 1: - loggingC.message((">Initializing a test submission to" \ + loggingC.message((">Initializing a test submission to " \ "the ENA dev server."), 0) else: - loggingC.message((">Initializing a LIVE SUBMISSION to" \ + loggingC.message((">Initializing a LIVE SUBMISSION to " \ "the ENA production server."), 0) time.sleep(5) @@ -334,8 +345,15 @@ def main(): if args.submit_assembly: metagenome_scientific_name = utility.from_config(config, 'METAGENOME_SCIENTIFIC_NAME') else: - metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, - args.development_service) + try: + metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, + args.development_service) + except: + # This is a workaround because I keep getting + # false negatives on the development server + # for samples that exist on both servers. 
+ metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, + False) submit_mags(config, metagenome_scientific_name, sample_accession_data, diff --git a/synum/preflight.py b/synum/preflight.py index 5e84cad..41d9251 100644 --- a/synum/preflight.py +++ b/synum/preflight.py @@ -9,6 +9,7 @@ import time import csv +checks_failed = False def __check_tsv(tsvfile: str, required_columns: list): @@ -29,9 +30,9 @@ def __check_tsv(tsvfile: str, header = f.readline().strip().split('\t') for col in required_columns: if col not in header: - err = f"\nERROR: The .tsv file '{tsvfile}' is missing the column '{col}'." + err = f"\nWARNING: The .tsv file '{tsvfile}' is missing the column '{col}'." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True if 'Bin_ids' in header: bin_ids = [] with open(tsvfile, 'r') as f: @@ -60,6 +61,7 @@ def __check_fields(items: list, category_name (str, optional): The name of the category being checked. Only used for error messages. Defaults to "item". """ + global checks_failed if not isinstance(items, list): items = [items] for item in items: @@ -68,18 +70,18 @@ def __check_fields(items: list, if not optional: err = f"\nERROR: A '{field}' field is missing in the {category_name} section (or one of the items in this section)." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True elif item[field] is None or item[field] == '': if not optional: err = f"\nERROR: A '{field}' field is empty in the {category_name} section (or one of the items in this section)." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True elif not isinstance(item[field], field_type): found_type = type(item[field]).__name__ desired_type = field_type.__name__ err = f"\nERROR: The '{field}' field in the {category_name} section (or one of the items in this section) is not of the correct type (type is <{found_type}> but should be <{desired_type}>)." 
loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True def __check_date(date: str): @@ -89,6 +91,7 @@ def __check_date(date: str): Args: date: The date string to check. """ + global checks_failed valid_formats = [ "%Y", # Year only "%Y-%m", # Year-month @@ -108,7 +111,7 @@ def __check_date(date: str): if not valid: err = f"\nERROR: The date '{date}' is not a valid ISO date string. Please check your config file." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True def __check_country_sea_location(location: str): @@ -118,11 +121,12 @@ def __check_country_sea_location(location: str): Args: location: The location string to check. """ + global checks_failed valid_locations = staticConfig.valid_locations.split(';') if not location in valid_locations: err = f"\nERROR: The location '{location}' is not a valid country or sea name. Please check your config file." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True def __check_study(config: dict, @@ -134,14 +138,25 @@ def __check_study(config: dict, config: The config file as a dictionary. testmode: Whether or not to use the ENA development server. """ + global checks_failed study_accession = utility.from_config(config, 'STUDY') if not enaSearching.study_exists(study_accession, testmode): if testmode: - err = f"\nERROR: The study accession '{study_accession}' does not exist on the ENA development server which you are trying to submit to. If you created it on the regular server, it can take up to 24 hours until it shows up on the development server." + if not enaSearching.study_exists(study_accession, False): + wrn = f"\nWARNING: The study accession '{study_accession}' " \ + "cannot be found on the ENA server. This might be okay " \ + "if you just created this study on the development " \ + "server. 
If that is the case, consider using " \ + "--skip_checks" + loggingC.message(wrn, threshold=-1) + checks_failed = True else: - err = f"\nERROR: The study accession '{study_accession}' does not exist on the ENA production server which you are trying to submit to. Did you create it on the development server?" - loggingC.message(err, threshold=-1) - exit(1) + err = f"\nERROR: The study accession '{study_accession}' does not " \ + "exist on the ENA production server which you are trying to " \ + "submit to. Did you create it only on the development " \ + "server?" + loggingC.message(err, threshold=-1) + checks_failed = True def __check_samples(arguments: dict, @@ -153,6 +168,7 @@ def __check_samples(arguments: dict, arguments: The command line arguments. config: The config file as a dictionary. """ + global checks_failed if not arguments['submit_samples']: return # Check data in SAMPLES section @@ -160,13 +176,33 @@ def __check_samples(arguments: dict, if len(sample_items) == 0: err = f"\nERROR: You chose to submit samples, but did not provide any sample data." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True mandatory_fields = [('collection date', str), ('geographic location (country and/or sea)', str)] __check_fields(sample_items, mandatory_fields, category_name="NEW_SAMPLE") for s in sample_items: __check_date(s['collection date']) __check_country_sea_location(s['geographic location (country and/or sea)']) + # Are all sample titles unique + titles = [] + study = utility.from_config(config, 'STUDY') + testmode = arguments['development_service'] + for sample in sample_items: + titles.append(sample['TITLE']) + if len(titles) != len(set(titles)): + err = f"\nERROR: The TITLEs in the SAMPLES section are not unique." 
+        loggingC.message(err, threshold=-1)
+        checks_failed = True
+    # Does one of the sample titles already exist in ENA
+    for title in titles:
+        if enaSearching.sample_title_accession(title, study, False) or enaSearching.sample_title_accession(title, study, testmode):
+            err = f"\nERROR: The sample title '{title}' was provided in the samples section but already exists on the ENA server as a sample title."
+            loggingC.message(err, threshold=-1)
+            checks_failed = True
+        if enaSearching.sample_alias_accession(title, study, False) or enaSearching.sample_alias_accession(title, study, testmode):
+            err = f"\nERROR: The sample title '{title}' was provided in the samples section but already exists on the ENA server as a sample alias."
+            loggingC.message(err, threshold=-1)
+            checks_failed = True
 
 
 def __check_read_type(paired: bool,
@@ -183,6 +219,7 @@
         arguments: The command line arguments.
         testmode: Whether or not to use the ENA development server.
     """
+    global checks_failed
     if paired:
         read_type = 'paired-end reads'
     else:
@@ -205,12 +242,21 @@
     else:
         mandatory_fields.append(('RELATED_SAMPLE_ACCESSION', str),)
 
+    read_aliases = []
     for s in read_items:
         # Check if all fields are present and not empty
         __check_fields(read_items, mandatory_fields, category_name=read_type)
-
+        # Check if the read name already exists as aliases in ENA
+        read_alias = s['NAME']
+        read_aliases.append(read_alias)
+        study = utility.from_config(config, 'STUDY')
+        testmode = arguments['development_service']
+        if enaSearching.run_alias_accession(read_alias, study, False) or enaSearching.run_alias_accession(read_alias, study, testmode):
+            err = f"\nERROR: The NAME '{read_alias}' was provided in the reads section but already exists on the ENA server as a run alias."
+ loggingC.message(err, threshold=-1) + checks_failed = True # Check if sample accessions are coherent if arguments['submit_samples']: sample_title = s['RELATED_SAMPLE_TITLE'] @@ -221,14 +267,20 @@ def __check_read_type(paired: bool, if sample_title not in titles: err = f"\nERROR: The sample title '{sample_title}' was provided in the reads section but not matching sample exist in the NEW_SAMPLES section." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True else: sample_accession = s['RELATED_SAMPLE_ACCESSION'] # Do the samples exist in ENA - if not enaSearching.sample_exists(sample_accession, testmode): - err = f"\nERROR: The sample accession '{sample_accession}' was provided in the reads section but does not exist on the ENA server." - loggingC.message(err, threshold=-1) - exit(1) + if not enaSearching.sample_accession_exists(sample_accession, False): + if not enaSearching.sample_accession_exists(sample_accession, testmode): + if testmode: + wrn = f"\nWARNING: The sample accession '{sample_accession}' cannot be found on the ENA server. This might be okay if you just created it on the development server. Consider using --skip_checks" + loggingC.message(wrn, threshold=-1) + checks_failed = True + else: + err = f"\nERROR: The sample accession '{sample_accession}' was provided in the reads section but does not exist on the ENA server." + loggingC.message(err, threshold=-1) + checks_failed = True # Is it the same samples from the sample_accessions field if 'SAMPLE_ACCESSIONS' in config: sample_accessions = utility.from_config(config, 'SAMPLE_ACCESSIONS') @@ -237,9 +289,13 @@ def __check_read_type(paired: bool, if not sample_accession in sample_accessions: err = f"\nERROR: The sample accession '{sample_accession}' was provided in the reads section but does not exist in the SAMPLE_ACCESSIONS field." 
loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True - + # Check if all aliases are unique + if len(read_aliases) != len(set(read_aliases)): + err = f"\nERROR: The NAMEs in the reads section are not unique." + loggingC.message(err, threshold=-1) + checks_failed = True # Check if the FASTQ file exists and has a valid file extension if paired: @@ -295,6 +351,7 @@ def __check_misc(arguments: dict, arguments: The command line arguments. config: The config file as a dictionary. """ + global checks_failed # Sequencing Platforms if arguments['submit_assembly'] or arguments['submit_bins'] or arguments['submit_mags']: sequencing_platforms = utility.from_config(config, 'SEQUENCING_PLATFORMS') @@ -305,11 +362,11 @@ def __check_misc(arguments: dict, if not platform in valid_platforms: err = f"\nERROR: The sequencing platform '{platform}' is not valid. Valid platforms are: {'|'.join(staticConfig.valid_sequencing_platforms.split(';'))}" loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True # Project Name if arguments['submit_mags']: - utility.from_config(config, 'PROJECT_NAME') + utility.stamped_from_config(config, 'PROJECT_NAME') # Metagenome taxonomy if arguments['submit_assembly']: @@ -322,30 +379,35 @@ def __check_misc(arguments: dict, '{metagenome_scientific_name}' but could not be mapped \ to an NCBI tax id." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True else: msg = f"Matched METAGENOME_SCIENTIFIC_NAME \ '{metagenome_scientific_name}' to NCBI tax id {ena_taxid}." loggingC.message(msg, threshold=1) -def __check_assembly_name(assembly_name): +def __check_assembly_name(arguments: dict, + assembly_name: str): """ Check if the assembly name is too long. The submission limit is 50 characters, but webin cli will introduce add a prefix before submission. Args: + arguments: The command line arguments. assembly_name: The assembly name read from the config. 
""" + global checks_failed if not isinstance(assembly_name, str): err = f"\nERROR: Please provide a string as the assembly name (found {type(assembly_name).__name__})." loggingC.message(err, threshold=-1) exit(1) max_length = staticConfig.max_assembly_name_length + if arguments['development_service'] or arguments['timestamps']: + max_length -= staticConfig.timestamp_length if len(assembly_name) > max_length: err = f"\nERROR: The assembly name '{assembly_name}' is too long. The maximum length is {max_length} characters." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True def __check_assembly(arguments: dict, @@ -359,13 +421,15 @@ def __check_assembly(arguments: dict, config: The config file as a dictionary. testmode: Whether or not to use the ENA development server. """ + global checks_failed + if testmode: servertype = "development" else: servertype = "production" assembly_data = utility.from_config(config, 'ASSEMBLY') - __check_assembly_name(assembly_data['ASSEMBLY_NAME']) + __check_assembly_name(arguments, assembly_data['ASSEMBLY_NAME']) if arguments['submit_assembly']: @@ -389,7 +453,7 @@ def __check_assembly(arguments: dict, gz_extensions = [f"{ext}.gz" for ext in valid_extensions] err = f"\nERROR: The assembly fasta file '{assembly_data['FASTA_FILE']}' does not exist or does not have a valid file extensions. Valid extensions are {'|'.join(valid_extensions)} and {'|'.join(gz_extensions)}." 
            loggingC.message(err, threshold=-1)
-            exit(1)
+            checks_failed = True
     else:
         biological_sample_accessions = utility.from_config(config,
                                                            'SAMPLE_ACCESSIONS')
@@ -401,7 +465,9 @@
                                                          'ASSEMBLY',
                                                          'EXISTING_ASSEMBLY_ANALYSIS_ACCESSION')
         if not assembly_analysis_accession is None or assembly_analysis_accession == '':
-            resulting_accession = enaSearching.search_samples_by_assembly_analysis(assembly_analysis_accession, testmode)
+            resulting_accession = enaSearching.search_samples_by_assembly_analysis(assembly_analysis_accession, False)
+            if not resulting_accession:
+                resulting_accession = enaSearching.search_samples_by_assembly_analysis(assembly_analysis_accession, testmode)
             if not len(biological_sample_accessions) == 1:
                 err = f"\nERROR: When providing an existing assembly analysis accession, you need to provide exactly one biological sample accession in the SAMPLE_ACCESSIONS field. If the assembly stems from 2 or more samples, please provide a co-assembly accession instead."
                 loggingC.message(err, threshold=-1)
@@ -411,16 +477,27 @@
         if resulting_accession is None and 'EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION' in assembly_data:
             sample_accessions = utility.optional_from_config(config, 'ASSEMBLY', 'EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION')
             if not sample_accessions is None or sample_accessions == '':
-                if not enaSearching.sample_exists(sample_accessions, testmode):
-                    err = f"\nERROR: The co-assembly sample accession '{sample_accessions}' could not be found on the {servertype} ENA server."
-                    loggingC.message(err, threshold=-1)
-                    exit(1)
+                if not enaSearching.sample_accession_exists(sample_accessions, False):
+                    if not enaSearching.sample_accession_exists(sample_accessions, testmode):
+                        if testmode:
+                            wrn = f"\nWARNING: The co-assembly sample " \
+                                  f"accession '{sample_accessions}' cannot " \
+                                  "be found on the ENA server. This might be " \
+                                  "okay if you just created it on the " \
+                                  "development server. 
Consider using " \ + "--skip_checks" + loggingC.message(wrn, threshold=-1) + checks_failed = True + else: + err = f"\nERROR: The co-assembly sample accession '{sample_accessions}' could not be found on the {servertype} ENA server." + loggingC.message(err, threshold=-1) + checks_failed = True else: resulting_accession = sample_accessions if len(biological_sample_accessions) < 2: err = f"\nERROR: When providing an existing co-assembly sample accession, you need to provide at least two biological sample accessions in the SAMPLE_ACCESSIONS field." loggingC.message(err, threshold=-1) - exit(1) + checks_failed = True else: resulting_accession = None if (not 'EXISTING_ASSEMBLY_ANALYSIS_ACCESSION' in assembly_data) and (not 'EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION' in assembly_data): @@ -444,6 +521,7 @@ def __check_bins(arguments: dict, config: The config file as a dictionary. testmode: Whether or not to use the ENA development server. """ + global checks_failed if not arguments['submit_bins'] and not arguments['submit_mags']: return @@ -464,7 +542,7 @@ def __check_bins(arguments: dict, if quality_file is None or quality_file == '': err = f"\nERROR: No QUALITY_FILE was provided in the BINS section." 
         loggingC.message(err, threshold=-1)
-        exit(1)
+        checks_failed = True
 
     __check_tsv(quality_file, staticConfig.bin_quality_columns.split(';'))
 
@@ -490,7 +568,7 @@ def __check_bins(arguments: dict,
             if not item in header:
                 err = f"\nERROR: The taxonomy file '{tax_file}' needs to have one of the following two sets of columns:\n{'|'.join(gtdb_majority_vote_columns)}\n{'|'.join(ncbi_taxonomy_columns)}"
                 loggingC.message(err, threshold=-1)
-                exit(1)
+                checks_failed = True
     if 'MANUAL_TAXONOMY_FILE' in bin_data.keys():
         if isinstance(bin_data['MANUAL_TAXONOMY_FILE'], list):
             err = f"\nERROR: In the BINS section, provide only one MANUAL_TAXONOMY_FILE please"
@@ -501,7 +579,7 @@ def __check_bins(arguments: dict,
             err = f"\nERROR: An empty string was provided as MANUAL_TAXONOMY_FILE in the BINS section."
             err += f"\nPlease provide a valid file path or remove the field from the config file."
             loggingC.message(err, threshold=-1)
-            exit(1)
+            checks_failed = True
         __check_tsv(bin_data['MANUAL_TAXONOMY_FILE'], staticConfig.manual_taxonomy_columns.split(';'))
         tax_files.append(bin_data['MANUAL_TAXONOMY_FILE'])
         # And if the headers of the MANUAL_TAXONOMY_FILE are correct
@@ -531,7 +609,7 @@ def __check_bins(arguments: dict,
     if not found:
         err = f"\nERROR: The bins directory '{bins_directory}' does not contain any fasta files."
         loggingC.message(err, threshold=-1)
-        exit(1)
+        checks_failed = True
 
     # Check if the required arguments in ASSEMBLY section are present
     assembly_data = utility.from_config(config, 'ASSEMBLY')
@@ -558,7 +636,7 @@ def __check_mags(arguments: dict,
         return
 
     # Check project name
-    utility.from_config(config, 'PROJECT_NAME')
+    utility.stamped_from_config(config, 'PROJECT_NAME')
 
     # Check the metadata file
     metadata_file = utility.from_config(config, 'MAGS', 'MAG_METADATA_FILE')
@@ -701,6 +779,9 @@ def preflight_checks(arguments: dict) -> None:
     Returns:
         The config file as a dictionary.
     """
+    msg = f">Running preflight checks. You can skip these by using the --skip_checks flag."
+    loggingC.message(msg, threshold=0)
+
     # Check if config file exists
     if not os.path.isfile(arguments['config']):
         err = f"\nERROR: The config file '{arguments['config']}' does not exist."
@@ -746,7 +827,16 @@ def preflight_checks(arguments: dict) -> None:
         loggingC.message(err, threshold=-1)
         exit(1)
 
-    msg = f">All preflight checks passed."
-    loggingC.message(msg, threshold=0)
+    if checks_failed:
+        msg = f"Some preflight checks failed. If you are sure that the data " \
+              "you provided is correct, you can skip these checks by using " \
+              "the --skip_checks flag. If any ERROR messages are not " \
+              "addressed, they are likely to cause failure after partial " \
+              "submission."
+        loggingC.message(msg, threshold=-1)
+        exit(1)
+    else:
+        msg = f">All preflight checks passed."
+        loggingC.message(msg, threshold=0)
 
     return config
diff --git a/synum/readSubmission.py b/synum/readSubmission.py
index ac7c17a..acdfe8a 100644
--- a/synum/readSubmission.py
+++ b/synum/readSubmission.py
@@ -2,7 +2,7 @@
 import csv
 
 from synum import loggingC, utility
-from synum.utility import from_config
+from synum.utility import from_config, stamped_from_config
 from synum.statConf import staticConfig
 from synum.webinWrapper import webin_cli
 
@@ -15,10 +15,23 @@ def __prep_reads_manifest(config: dict,
                           staging_dir: str,
                           fastq1_path: str,
                           fastq2_path: str) -> str:
+    """
+    Prepare the manifest file for reads submission.
+
+    Args:
+        config (dict): The configuration dictionary.
+        sample_accession_data (list): Contains one dictionary with information
+            on each sample. Dict keys are 'accession', 'external_accession',
+            and 'alias'.
+        data (dict): dictionary containing the reads data.
+        staging_dir (str): The directory where the reads are staged.
+        fastq1_path (str): The path to the first fastq file.
+        fastq2_path (str): The path to the second fastq file (if paired-end).
+ """ # Find the related sample if 'RELATED_SAMPLE_TITLE' in data.keys(): - sample_title = from_config(data, 'RELATED_SAMPLE_TITLE') + sample_title = stamped_from_config(data, 'RELATED_SAMPLE_TITLE') sample = None for sd in sample_accession_data: if sd['alias'] == sample_title: @@ -27,6 +40,7 @@ def __prep_reads_manifest(config: dict, if sample is None: err = f"\nERROR: No related sample found for the reads with title '{sample_title}'. Please check the configuration file." loggingC.message(err, threshold=-1) + exit(1) elif 'RELATED_SAMPLE_ACCESSION' in data.keys(): sample = from_config(data, 'RELATED_SAMPLE_ACCESSION') else: @@ -37,7 +51,7 @@ def __prep_reads_manifest(config: dict, rows = [ ['STUDY', from_config(config, 'STUDY')], ['SAMPLE', sample], - ['NAME', from_config(data, 'NAME')], + ['NAME', stamped_from_config(data, 'NAME')], ['INSTRUMENT', from_config(data, 'SEQUENCING_INSTRUMENT')], ['LIBRARY_SOURCE', from_config(data, 'LIBRARY_SOURCE')], ['LIBRARY_SELECTION', from_config(data, 'LIBRARY_SELECTION')], @@ -87,8 +101,21 @@ def __stage_reads_submission(config: dict, staging_dir: str, logging_dir: str) -> str: """ + Stage the reads for submission. + + Args: + config (dict): The configuration dictionary. + sample_accession_data (list): Contains one dictionary with information + on each sample. Dict keys are 'accession', 'external_accession', + and 'alias'. + data (dict): dictionary containing the reads data. + staging_dir (str): The directory where the reads will be staged. + logging_dir (str): The directory where the submission logs will be + written. + + Returns: + str: The path to the manifest file. 
""" - # Stage the fastq file(s) gzipped_fastq1_path = os.path.join(staging_dir, 'reads_1' + staticConfig.zipped_fastq_extension) gzipped_fastq2_path = os.path.join(staging_dir, 'reads_2' + staticConfig.zipped_fastq_extension) @@ -140,7 +167,7 @@ def submit_reads(config, if 'PAIRED_END_READS' in config.keys(): loggingC.message(">Staging paired-end reads for submission. This might take a while.", threshold=0) for i, data in enumerate(from_config(config, 'PAIRED_END_READS')): - name = from_config(data, 'NAME').replace(' ', '_') + name = stamped_from_config(data, 'NAME').replace(' ', '_') read_set_staging_dir = os.path.join(staging_dir, f"reads_{name}") os.makedirs(read_set_staging_dir, exist_ok=False) read_set_logging_dir = os.path.join(logging_dir, f"reads_{name}") @@ -158,7 +185,7 @@ def submit_reads(config, loggingC.message(">Staging single-end reads for submission. This might take a while.", threshold=0) for j, data in enumerate(from_config(config, 'SINGLE_READS')): i = counter + j - name = from_config(data, 'NAME').replace(' ', '_') + name = stamped_from_config(data, 'NAME').replace(' ', '_') read_set_staging_dir = os.path.join(staging_dir, f"reads_{name}") os.makedirs(read_set_staging_dir, exist_ok=False) read_set_logging_dir = os.path.join(logging_dir, f"reads_{name}") diff --git a/synum/sampleSubmission.py b/synum/sampleSubmission.py index 5cd406c..e43d9ef 100644 --- a/synum/sampleSubmission.py +++ b/synum/sampleSubmission.py @@ -1,7 +1,7 @@ import os from synum import loggingC, utility -from synum.utility import from_config +from synum.utility import from_config, stamped_from_config from synum.statConf import staticConfig import requests @@ -27,7 +27,7 @@ def __prep_samplesheet(config: dict, metagenome_taxid = from_config(config, 'METAGENOME_TAXID') for data in samples_data: - tree_sample = ET.SubElement(tree_root, 'SAMPLE', alias=from_config(data, 'TITLE')) + tree_sample = ET.SubElement(tree_root, 'SAMPLE', alias=stamped_from_config(data, 'TITLE')) 
tree_title = ET.SubElement(tree_sample, 'TITLE') tree_title.text = from_config(data, 'TITLE') diff --git a/synum/statConf.py b/synum/statConf.py index 329165d..e9f9c3e 100644 --- a/synum/statConf.py +++ b/synum/statConf.py @@ -2,7 +2,7 @@ @dataclass class staticConfig: - synum_version: str = '0.9.0' + synum_version: str = '0.9.1' webin_cli_version: str = '7.0.1' ena_dropbox_url: str = 'https://www.ebi.ac.uk/ena/submit/drop-box/submit/' ena_test_dropbox_url: str = 'https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit/' @@ -40,6 +40,7 @@ class staticConfig: mag_qstring_high: str = 'Multiple fragments where gaps span repetitive regions. Presence of the 23S, 16S and 5S rRNA genes and at least 18 tRNAs.' mag_qstring_medium: str = 'Many fragments with little to no review of assembly other than reporting of standard assembly statistics.' max_assembly_name_length: int = 50 - len('webin-genome-' + '_SAMEA________') + timestamp_length: int = 4 ena_rest_rate_limit: int = 50 # requests per second submission_modes_message: str = """ The following modes of submission are supported: diff --git a/synum/taxQuery.py b/synum/taxQuery.py index efe0eb1..48dcaad 100644 --- a/synum/taxQuery.py +++ b/synum/taxQuery.py @@ -369,7 +369,7 @@ def get_bin_taxonomy(config) -> dict: # Read data from NCBI_TAXONOMY_FILES ncbi_taxonomy_files = utility.optional_from_config(config, 'BINS', 'NCBI_TAXONOMY_FILES') - if ncbi_taxonomy_files == '': # Key not found in config + if ncbi_taxonomy_files == None: # Key not found in config ncbi_taxonomy_files = [] loggingC.message(">No NCBI taxonomy files found in config.", threshold=0) if not type(ncbi_taxonomy_files) == list: diff --git a/synum/utility.py b/synum/utility.py index 2e0b406..953944e 100644 --- a/synum/utility.py +++ b/synum/utility.py @@ -14,6 +14,30 @@ from synum.statConf import staticConfig +# Global variable for timestamping data that is pulled from the config +timestamp = None +keys_to_stamp = [] + + +def set_up_timestamps(arguments: dict): + 
""" + Set up the timestamp for the submission. Data is only timestamped with + the hour and minute. Because of daily resets, this is sufficient to + prevent name clashes on the development server. + """ + global timestamp + global keys_to_stamp + keys_to_stamp = [ + "PROJECT_NAME", + "NAME", + "TITLE", + "RELATED_SAMPLE_TITLE" + ] + if arguments['submit_assembly']: + keys_to_stamp.append("ASSEMBLY_NAME") + timestamp = time.strftime("%H%M") + + def construct_depth_files(staging_dir: str, threads: int, bam_files: list) -> dict: @@ -180,23 +204,37 @@ def from_config(config, key, subkey=None, subsubkey=None, supress_errors=False): """ Extracts a value from the dict that was created based on the config YAML file. + + Args: + config (dict): The dict created from the config YAML file. + key (str): The key to extract from the dict. + subkey (str): The nested key to extract from the key. + subsubkey (str): The nested key to extract from the subkey. + supress_errors (bool): If True, missing keys will not cause an exit + but will instead return None. """ if not key in config: if not supress_errors: err = f"\nERROR: The field '{key}' is missing from the config YAML file." loggingC.message(err, threshold=-1) exit(1) + else: + return None if not config[key]: if not supress_errors: err = f"\nERROR: The field '{key}' is empty in the config YAML file." loggingC.message(err, threshold=-1) exit(1) + else: + return None if subkey: if not subkey in config[key]: if not supress_errors: err = f"\nERROR: The field '{key}|{subkey}' is missing from the config YAML file." loggingC.message(err, threshold=-1) exit(1) + else: + return None if not config[key][subkey]: if not supress_errors: err = f"\nERROR: The field '{key}|{subkey}' is empty in the config YAML file." @@ -208,37 +246,54 @@ def from_config(config, key, subkey=None, subsubkey=None, supress_errors=False): err = f"\nERROR: The field '{key}|{subkey}|{subsubkey}' is missing from the config YAML file." 
loggingC.message(err, threshold=-1) exit(1) + else: + return None if not config[key][subkey][subsubkey]: if not supress_errors: err = f"\nERROR: The field '{key}|{subkey}|{subsubkey}' is empty in the config YAML file." loggingC.message(err, threshold=-1) exit(1) + else: + return None return __strcast(config[key][subkey][subsubkey]) return __strcast(config[key][subkey]) return __strcast(config[key]) def optional_from_config(config, key, subkey=None, subsubkey=None): + """ + Calls from config but returns None if the key is missing. + + Args: + config (dict): The dict created from the config YAML file. + key (str): The key to extract from the dict. + subkey (str): The nested key to extract from the key. + subsubkey (str): The nested key to extract from the subkey. + """ try: return from_config(config, key, subkey, subsubkey, supress_errors=True) except: - return '' - -# def calculate_assembly_length(fasta_file): -# """ -# Calculate the total length of the assembly from a FASTA file. + return None + +def stamped_from_config(config, key, subkey=None, subsubkey=None): + """ + Calls from config but adds a timestamp to relevant fields if timestamping + is activated. -# Args: -# fasta_file (str): File path to the FASTA file. Can be gzipped. + Args: + config (dict): The dict created from the config YAML file. + key (str): The key to extract from the dict. + subkey (str): The nested key to extract from the key. + subsubkey (str): The nested key to extract from the subkey. + """ + global timestamp + global keys_to_stamp + lowest_key = subsubkey or subkey or key -# Returns: -# int: Total length of the assembly. 
-# """ -# total_length = 0 -# with pysam.FastaFile(fasta_file) as fasta: -# for seq in fasta.references: -# total_length += fasta.get_reference_length(seq) + value = from_config(config, key, subkey, subsubkey) + if timestamp and (lowest_key in keys_to_stamp): + value = f"{timestamp}{value}" + return value -# return total_length def check_fastq(fastq_filepath: str): """ @@ -400,6 +455,7 @@ def contigs_coverage(depth_file): def calculate_coverage(depth_files: list, target_contigs: set = None, threads=4, + outfile=None, silent=False): """ Calculate the average coverage of an assembly based on multiple depth files using parallel processing. @@ -435,7 +491,11 @@ def process_file(depth_file): else: threshold = 0 loggingC.message(">Calculating coverage from depth files. This might take a while.", threshold) - + msg = f">The coverage value will be written to {outfile} in case you " \ + " want to provide a KNOWN_COVERAGE in the ASSEMBLY section " \ + " in the config for subsequent submission attempts." + if outfile: + loggingC.message(msg, threshold=0) inuse = min(threads, len(depth_files)) with yaspin(text=f"Processing with {inuse} threads...\t", color="yellow") as spinner: @@ -449,7 +509,11 @@ def process_file(depth_file): average_coverage = total_coverage / total_length if total_length > 0 else 0 if not silent: - loggingC.message(f"\t...coverage is {str(average_coverage)}", threshold+1) + loggingC.message(f"\t...coverage is {str(average_coverage)}", threshold=0) + + if outfile: + with open(outfile, 'w') as f: + f.write(str(average_coverage)) return average_coverage