From b64231c8a13944b8a183a5ad086d1718ea8040ee Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Thu, 15 Jul 2021 20:27:51 +0100 Subject: [PATCH 01/75] Initial split of v2 ANIb code into `anib.py` and `aniblastall.py` files Including changing the parameter `mode` to `method` --- pyani/anib.py | 171 ++++++--------------- pyani/aniblastall.py | 355 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 398 insertions(+), 128 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 80b2947c..89848e49 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -224,7 +224,7 @@ def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict: def make_blastcmd_builder( - mode: str, + method: str, outdir: Path, format_exe: Optional[Path] = None, blast_exe: Optional[Path] = None, @@ -232,32 +232,21 @@ def make_blastcmd_builder( ) -> BLASTcmds: """Return BLASTcmds object for construction of BLAST commands. - :param mode: str, the kind of ANIb analysis (ANIb or ANIblastall) + :param method: str, the kind of ANI analysis (ANIb) :param outdir: :param format_exe: :param blast_exe: :param prefix: """ - if mode == "ANIb": # BLAST/formatting executable depends on mode - blastcmds = BLASTcmds( - BLASTfunctions(construct_makeblastdb_cmd, construct_blastn_cmdline), - BLASTexes( - format_exe or pyani_config.MAKEBLASTDB_DEFAULT, - blast_exe or pyani_config.BLASTN_DEFAULT, - ), - prefix, - outdir, - ) - else: - blastcmds = BLASTcmds( - BLASTfunctions(construct_formatdb_cmd, construct_blastall_cmdline), - BLASTexes( - format_exe or pyani_config.FORMATDB_DEFAULT, - blast_exe or pyani_config.BLASTALL_DEFAULT, - ), - prefix, - outdir, - ) + blastcmds = BLASTcmds( + BLASTfunctions(construct_makeblastdb_cmd, construct_blastn_cmdline), + BLASTexes( + format_exe or pyani_config.MAKEBLASTDB_DEFAULT, + blast_exe or pyani_config.BLASTN_DEFAULT, + ), + prefix, + outdir, + ) return blastcmds @@ -271,9 +260,6 @@ def make_job_graph( :param fragfiles: list of paths to fragmented input FASTA files :param blastcmds: - By default, will run ANIb - it *is* possible to make a mess of passing the - wrong executable for the mode you're using. - All items in the returned graph list are BLAST executable jobs that must be run *after* the corresponding database creation. The Job objects corresponding to the database creation are contained as dependencies. @@ -321,24 +307,19 @@ def generate_blastdb_commands( filenames: List[Path], outdir: Path, blastdb_exe: Optional[Path] = None, - mode: str = "ANIb", ) -> List[Tuple[str, Path]]: - """Return list of makeblastdb command-lines for ANIb/ANIblastall. + """Return list of makeblastdb command-lines for ANIb. :param filenames: a list of paths to input FASTA files :param outdir: path to output directory :param blastdb_exe: path to the makeblastdb executable - :param mode: str, ANIb analysis type (ANIb or ANIblastall) + :param method: str, ANIb analysis type (ANIb) """ - if mode == "ANIb": - construct_db_cmdline = construct_makeblastdb_cmd - else: - construct_db_cmdline = construct_formatdb_cmd if blastdb_exe is None: - cmdlines = [construct_db_cmdline(fname, outdir) for fname in filenames] + cmdlines = [construct_makeblastdb_cmd(fname, outdir) for fname in filenames] else: cmdlines = [ - construct_db_cmdline(fname, outdir, blastdb_exe) for fname in filenames + construct_makeblastdb_cmd(fname, outdir, blastdb_exe) for fname in filenames ] return cmdlines @@ -360,58 +341,38 @@ def construct_makeblastdb_cmd( ) -# Generate single makeblastdb command line -def construct_formatdb_cmd( - filename: Path, outdir: Path, blastdb_exe: Path = pyani_config.FORMATDB_DEFAULT -) -> Tuple[str, Path]: - """Return formatdb command and path to output file. - - :param filename: Path, input filename - :param outdir: Path, path to output directory - :param blastdb_exe: Path, path to the formatdb executable - """ - newfilename = outdir / filename.name - shutil.copy(filename, newfilename) - return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename) - - # Generate list of BLASTN command lines from passed filenames def generate_blastn_commands( filenames: List[Path], outdir: Path, blast_exe: Optional[Path] = None, - mode: str = "ANIb", ) -> List[str]: - """Return a list of blastn command-lines for ANIm. + """Return a list of blastn command-lines for ANIb. :param filenames: a list of paths to fragmented input FASTA files :param outdir: path to output directory :param blastn_exe: path to BLASTN executable - :param mode: str, analysis type (ANIb or ANIblastall) + :param method: str, analysis type (ANIb) Assumes that the fragment sequence input filenames have the form ACCESSION-fragments.ext, where the corresponding BLAST database filenames have the form ACCESSION.ext. This is the convention followed by the fragment_FASTA_files() function above. """ - if mode == "ANIb": - construct_blast_cmdline = construct_blastn_cmdline # type: Callable - else: - construct_blast_cmdline = construct_blastall_cmdline cmdlines = [] for idx, fname1 in enumerate(filenames[:-1]): dbname1 = Path(str(fname1).replace("-fragments", "")) for fname2 in filenames[idx + 1 :]: dbname2 = Path(str(fname2).replace("-fragments", "")) if blast_exe is None: - cmdlines.append(construct_blast_cmdline(fname1, dbname2, outdir)) - cmdlines.append(construct_blast_cmdline(fname2, dbname1, outdir)) + cmdlines.append(construct_blastn_cmdline(fname1, dbname2, outdir)) + cmdlines.append(construct_blastn_cmdline(fname2, dbname1, outdir)) else: cmdlines.append( - construct_blast_cmdline(fname1, dbname2, outdir, blast_exe) + construct_blastn_cmdline(fname1, dbname2, outdir, blast_exe) ) cmdlines.append( - construct_blast_cmdline(fname2, dbname1, outdir, blast_exe) + construct_blastn_cmdline(fname2, dbname1, outdir, blast_exe) ) return cmdlines @@ -440,33 +401,12 @@ def construct_blastn_cmdline( ) -# Generate single BLASTALL command line -def construct_blastall_cmdline( - fname1: Path, - fname2: Path, - outdir: Path, - blastall_exe: Path = pyani_config.BLASTALL_DEFAULT, -) -> str: - """Return single blastall command. - - :param fname1: - :param fname2: - :param outdir: - :param blastall_exe: str, path to BLASTALL executable - """ - prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}" - return ( - f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {fname1} -d {fname2} " - "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" - ) - - # Process pairwise BLASTN output def process_blast( blast_dir: Path, org_lengths: Dict, fraglengths: Dict, - mode: str = "ANIb", + method: str = "ANIb", logger: Optional[Logger] = None, ) -> ANIResults: """Return tuple of ANIb results for .blast_tab files in the output dir. @@ -475,7 +415,7 @@ def process_blast( :param org_lengths: Dict, the base count for each input sequence :param fraglengths: dictionary of query sequence fragment lengths, only needed for BLASTALL output - :param mode: str, analysis type (ANIb or ANIblastall) + :param method: str, analysis type (ANIb) :param logger: a logger for messages Returns the following pandas dataframes in an ANIResults object; @@ -492,7 +432,7 @@ def process_blast( # Process directory to identify input files blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab") # Hold data in ANIResults object - results = ANIResults(list(org_lengths.keys()), mode) + results = ANIResults(list(org_lengths.keys()), method) # Fill diagonal NA values for alignment_length with org_lengths for org, length in list(org_lengths.items()): @@ -521,7 +461,7 @@ def process_blast( blastfile, ) continue - resultvals = parse_blast_tab(blastfile, fraglengths, mode) + resultvals = parse_blast_tab(blastfile, fraglengths, method) query_cover = float(resultvals[0]) / org_lengths[qname] # Populate dataframes: when assigning data, we need to note that @@ -535,15 +475,13 @@ def process_blast( # Parse BLASTALL output to get total alignment length and mismatches -def parse_blast_tab( - filename: Path, fraglengths: Dict, mode: str = "ANIb" -) -> Tuple[int, int, int]: +def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: """Return (alignment length, similarity errors, mean_pid) tuple. :param filename: Path, path to .blast_tab file :param fraglengths: Optional[Dict], dictionary of fragment lengths for each genome. - :param mode: str, analysis type (ANIb or ANIblastall) + :param method: str, analysis type (ANIb) Calculate the alignment length and total number of similarity errors (as we would with ANIm), as well as the Goris et al.-defined mean identity @@ -556,41 +494,23 @@ def parse_blast_tab( over an alignable region of at least 70% of their length. ''' """ - # Assuming that the filename format holds org1_vs_org2.blast_tab: - qname = filename.stem.split("_vs_")[0] # Load output as dataframe - if mode == "ANIblastall": - qfraglengths = fraglengths[qname] - columns = [ - "sid", - "blast_pid", - "blast_alnlen", - "blast_mismatch", - "blast_gaps", - "q_start", - "q_end", - "s_start", - "s_end", - "e_Value", - "bit_score", - ] - else: - columns = [ - "sbjct_id", - "blast_alnlen", - "blast_mismatch", - "blast_pid", - "blast_identities", - "qlen", - "slen", - "q_start", - "q_end", - "s_start", - "s_end", - "blast_pos", - "ppos", - "blast_gaps", - ] + columns = [ + "sbjct_id", + "blast_alnlen", + "blast_mismatch", + "blast_pid", + "blast_identities", + "qlen", + "slen", + "q_start", + "q_end", + "s_start", + "s_end", + "blast_pos", + "ppos", + "blast_gaps", + ] # We may receive an empty BLASTN output file, if there are no significant # regions of homology. This causes pandas to throw an error on CSV import. # To get past this, we create an empty dataframe with the appropriate @@ -600,11 +520,6 @@ def parse_blast_tab( data.columns = columns except pd.io.common.EmptyDataError: data = pd.DataFrame(columns=columns) - # Add new column for fragment length, only for BLASTALL - if mode == "ANIblastall": - data["qlen"] = pd.Series( - [qfraglengths[idx] for idx in data.index], index=data.index - ) # Add new columns for recalculated alignment length, proportion, and # percentage identity data["ani_alnlen"] = data["blast_alnlen"] - data["blast_gaps"] diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index a6c0a345..964e3ee5 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -38,11 +38,18 @@ import platform import re import subprocess +import shutil +import pandas as pd from pathlib import Path +from typing import List, Tuple, Dict, Optional + +from Bio import SeqIO from . import pyani_config +from . import pyani_jobs +from .pyani_tools import BLASTcmds, BLASTfunctions, BLASTexes def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str: @@ -72,3 +79,351 @@ def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str: r"(?<=blastall\s)[0-9\.]*", str(result.stderr, "utf-8") ).group() return f"{platform.system()}_{version}" + + +# Divide input FASTA sequences into fragments +def fragment_fasta_files( + infiles: List[Path], outdirname: Path, fragsize: int +) -> Tuple[List, Dict]: + """Chop sequences of the passed files into fragments, return filenames. + + :param infiles: collection of paths to each input sequence file + :param outdirname: Path, path to output directory + :param fragsize: Int, the size of sequence fragments + + Takes every sequence from every file in infiles, and splits them into + consecutive fragments of length fragsize, (with any trailing sequences + being included, even if shorter than fragsize), writing the resulting + set of sequences to a file with the same name in the specified + output directory. + + All fragments are named consecutively and uniquely (within a file) as + fragNNNNN. Sequence description fields are retained. + + Returns a tuple ``(filenames, fragment_lengths)`` where ``filenames`` is a + list of paths to the fragment sequence files, and ``fragment_lengths`` is + a dictionary of sequence fragment lengths, keyed by the sequence files, + with values being a dictionary of fragment lengths, keyed by fragment + IDs. + """ + outfnames = [] + for fname in infiles: + outfname = outdirname / f"{fname.stem}-fragments{fname.suffix}" + outseqs = [] + count = 0 + for seq in SeqIO.parse(fname, "fasta"): + idx = 0 + while idx < len(seq): + count += 1 + newseq = seq[idx : idx + fragsize] + newseq.id = "frag%05d" % count + outseqs.append(newseq) + idx += fragsize + outfnames.append(outfname) + SeqIO.write(outseqs, outfname, "fasta") + return outfnames, get_fraglength_dict(outfnames) + + +# Get lengths of all sequences in all files +def get_fraglength_dict(fastafiles: List[Path]) -> Dict: + """Return dictionary of sequence fragment lengths, keyed by query name. + + :param fastafiles: list of paths to FASTA input whole sequence files + + Loops over input files and, for each, produces a dictionary with fragment + lengths, keyed by sequence ID. These are returned as a dictionary with + the keys being query IDs derived from filenames. + """ + fraglength_dict = {} + for filename in fastafiles: + qname = filename.stem.split("-fragments")[0] + fraglength_dict[qname] = get_fragment_lengths(filename) + return fraglength_dict + + +# Get lengths of all sequences in a file +def get_fragment_lengths(fastafile: Path) -> Dict: + """Return dictionary of sequence fragment lengths, keyed by fragment ID. + + :param fastafile: + + Biopython's SeqIO module is used to parse all sequences in the FASTA + file. + + NOTE: ambiguity symbols are not discounted. + """ + fraglengths = {} + for seq in SeqIO.parse(fastafile, "fasta"): + fraglengths[seq.id] = len(seq) + return fraglengths + + +# Create dictionary of database building commands, keyed by dbname +def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict: + """Return dictionary of db-building commands, keyed by dbname. + + :param infiles: + :param blastcmds: + """ + dbjobdict = {} # Dict of database construction jobs, keyed by filename + # Create dictionary of database building jobs, keyed by db name + # defining jobnum for later use as last job index used + for idx, fname in enumerate(infiles): + dbjobdict[blastcmds.get_db_name(fname)] = pyani_jobs.Job( + f"{blastcmds.prefix}_db_{idx:06}", blastcmds.build_db_cmd(fname) + ) + return dbjobdict + + +def make_blastcmd_builder( + method: str, + outdir: Path, + format_exe: Optional[Path] = None, + blast_exe: Optional[Path] = None, + prefix: str = "ANIBLAST", +) -> BLASTcmds: + """Return BLASTcmds object for construction of BLAST commands. + + :param method: str, the kind of ANI analysis (ANIblastall) + :param outdir: + :param format_exe: + :param blast_exe: + :param prefix: + """ + blastcmds = BLASTcmds( + BLASTfunctions(construct_formatdb_cmd, construct_blastall_cmdline), + BLASTexes( + format_exe or pyani_config.FORMATDB_DEFAULT, + blast_exe or pyani_config.BLASTALL_DEFAULT, + ), + prefix, + outdir, + ) + return blastcmds + + +# Make a dependency graph of BLAST commands +def make_job_graph( + infiles: List[Path], fragfiles: List[Path], blastcmds: BLASTcmds +) -> List[pyani_jobs.Job]: + """Return job dependency graph, based on the passed input sequence files. + + :param infiles: list of paths to input FASTA files + :param fragfiles: list of paths to fragmented input FASTA files + :param blastcmds: + + All items in the returned graph list are BLAST executable jobs that must + be run *after* the corresponding database creation. The Job objects + corresponding to the database creation are contained as dependencies. + How those jobs are scheduled depends on the scheduler (see + run_multiprocessing.py, run_sge.py) + """ + joblist = [] # Holds list of job dependency graphs + + # Get dictionary of database-building jobs + dbjobdict = build_db_jobs(infiles, blastcmds) + + # Create list of BLAST executable jobs, with dependencies + jobnum = len(dbjobdict) + for idx, fname1 in enumerate(fragfiles[:-1]): + for fname2 in fragfiles[idx + 1 :]: + jobnum += 1 + jobs = [ + pyani_jobs.Job( + f"{blastcmds.prefix}_exe_{jobnum:06d}_a", + blastcmds.build_blast_cmd( + fname1, fname2.parent / fname2.name.replace("-fragments", "") + ), + ), + pyani_jobs.Job( + f"{blastcmds.prefix}_exe_{jobnum:06d}_b", + blastcmds.build_blast_cmd( + fname2, fname1.parent / fname1.name.replace("-fragments", "") + ), + ), + ] + jobs[0].add_dependency( + dbjobdict[fname1.parent / fname1.name.replace("-fragments", "")] + ) + jobs[1].add_dependency( + dbjobdict[fname2.parent / fname2.name.replace("-fragments", "")] + ) + joblist.extend(jobs) + + # Return the dependency graph + return joblist + + +# Generate list of makeblastdb command lines from passed filenames +def generate_blastdb_commands( + filenames: List[Path], + outdir: Path, + blastdb_exe: Optional[Path] = None, +) -> List[Tuple[str, Path]]: + """Return list of makeblastdb command-lines for ANIblastall. + + :param filenames: a list of paths to input FASTA files + :param outdir: path to output directory + :param blastdb_exe: path to the makeblastdb executable + :param method: str, ANI analysis type (ANIblastall) + """ + if blastdb_exe is None: + cmdlines = [construct_formatdb_cmd(fname, outdir) for fname in filenames] + else: + cmdlines = [ + construct_formatdb_cmd(fname, outdir, blastdb_exe) for fname in filenames + ] + return cmdlines + + +# Generate single makeblastdb command line +def construct_formatdb_cmd( + filename: Path, outdir: Path, blastdb_exe: Path = pyani_config.FORMATDB_DEFAULT +) -> Tuple[str, Path]: + """Return formatdb command and path to output file. + + :param filename: Path, input filename + :param outdir: Path, path to output directory + :param blastdb_exe: Path, path to the formatdb executable + """ + newfilename = outdir / filename.name + shutil.copy(filename, newfilename) + return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename) + + +# Generate list of BLASTN command lines from passed filenames +def generate_blastn_commands( + filenames: List[Path], + outdir: Path, + blast_exe: Optional[Path] = None, +) -> List[str]: + """Return a list of blastn command-lines for ANIblastall. + + :param filenames: a list of paths to fragmented input FASTA files + :param outdir: path to output directory + :param blastn_exe: path to BLASTN executable + :param method: str, analysis type (ANIblastall) + + Assumes that the fragment sequence input filenames have the form + ACCESSION-fragments.ext, where the corresponding BLAST database filenames + have the form ACCESSION.ext. This is the convention followed by the + fragment_FASTA_files() function above. + """ + cmdlines = [] + for idx, fname1 in enumerate(filenames[:-1]): + dbname1 = Path(str(fname1).replace("-fragments", "")) + for fname2 in filenames[idx + 1 :]: + dbname2 = Path(str(fname2).replace("-fragments", "")) + if blast_exe is None: + cmdlines.append(construct_blastall_cmdline(fname1, dbname2, outdir)) + cmdlines.append(construct_blastall_cmdline(fname2, dbname1, outdir)) + else: + cmdlines.append( + construct_blastall_cmdline(fname1, dbname2, outdir, blast_exe) + ) + cmdlines.append( + construct_blastall_cmdline(fname2, dbname1, outdir, blast_exe) + ) + return cmdlines + + +# Generate single BLASTALL command line +def construct_blastall_cmdline( + fname1: Path, + fname2: Path, + outdir: Path, + blastall_exe: Path = pyani_config.BLASTALL_DEFAULT, +) -> str: + """Return single blastall command. + + :param fname1: + :param fname2: + :param outdir: + :param blastall_exe: str, path to BLASTALL executable + """ + prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}" + return ( + f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {fname1} -d {fname2} " + "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" + ) + + +# Parse BLASTALL output to get total alignment length and mismatches +def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: + """Return (alignment length, similarity errors, mean_pid) tuple. + + :param filename: Path, path to .blast_tab file + :param fraglengths: Optional[Dict], dictionary of fragment lengths for each + genome. + :param method: str, analysis type (ANIblastall) + + Calculate the alignment length and total number of similarity errors (as + we would with ANIm), as well as the Goris et al.-defined mean identity + of all valid BLAST matches for the passed BLASTALL alignment .blast_tab + file. + + '''ANI between the query genome and the reference genome was calculated as + the mean identity of all BLASTN matches that showed more than 30% overall + sequence identity (recalculated to an identity along the entire sequence) + over an alignable region of at least 70% of their length. + ''' + """ + # Assuming that the filename format holds org1_vs_org2.blast_tab: + qname = filename.stem.split("_vs_")[0] + # Load output as dataframe + qfraglengths = fraglengths[qname] + columns = [ + "sid", + "blast_pid", + "blast_alnlen", + "blast_mismatch", + "blast_gaps", + "q_start", + "q_end", + "s_start", + "s_end", + "e_Value", + "bit_score", + ] + # We may receive an empty BLASTN output file, if there are no significant + # regions of homology. This causes pandas to throw an error on CSV import. + # To get past this, we create an empty dataframe with the appropriate + # columns. + try: + data = pd.read_csv(filename, header=None, sep="\t", index_col=0) + data.columns = columns + except pd.io.common.EmptyDataError: + data = pd.DataFrame(columns=columns) + # Add new column for fragment length, only for BLASTALL + data["qlen"] = pd.Series( + [qfraglengths[idx] for idx in data.index], index=data.index + ) + # Add new columns for recalculated alignment length, proportion, and + # percentage identity + data["ani_alnlen"] = data["blast_alnlen"] - data["blast_gaps"] + data["ani_alnids"] = data["ani_alnlen"] - data["blast_mismatch"] + data["ani_coverage"] = data["ani_alnlen"] / data["qlen"] + data["ani_pid"] = data["ani_alnids"] / data["qlen"] + # Filter rows on 'ani_coverage' > 0.7, 'ani_pid' > 0.3 + filtered = data[(data["ani_coverage"] > 0.7) & (data["ani_pid"] > 0.3)] + # Dedupe query hits, so we only take the best hit + filtered = filtered.groupby(filtered.index).first() + # Replace NaNs with zero + filtered = filtered.fillna(value=0) # Needed if no matches + # The ANI value is then the mean percentage identity. + # We report total alignment length and the number of similarity errors + # (mismatches and gaps), as for ANIm + # NOTE: We report the mean of 'blast_pid' for concordance with JSpecies + # Despite this, the concordance is not exact. Manual inspection during + # development indicated that a handful of fragments are differentially + # filtered out in JSpecies and this script. This is often on the basis + # of rounding differences (e.g. coverage being close to 70%). + # NOTE: If there are no hits, then ani_pid will be nan - we replace this + # with zero if that happens + ani_pid = filtered["blast_pid"].mean() + if pd.isnull(ani_pid): # Happens if there are no matches in ANIb + ani_pid = 0 + aln_length = filtered["ani_alnlen"].sum() + sim_errors = filtered["blast_mismatch"].sum() + filtered["blast_gaps"].sum() + filtered.to_csv(Path(filename).with_suffix(".blast_tab.dataframe"), sep="\t") + return aln_length, sim_errors, ani_pid From 39bdde19db3be4403348c8ab4fee51114e16ae6c Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 21 Jul 2021 19:54:58 +0100 Subject: [PATCH 02/75] Updated comments/logging --- pyani/scripts/subcommands/subcmd_anib.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index d16cdeb7..d46547c5 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -89,11 +89,11 @@ def subcmd_anib(args: Namespace) -> None: The calculated values are stored in the local SQLite3 database. """ + # Create logger logger = logging.getLogger(__name__) - logger.info( - termcolor("Running ANIm analysis", "red") - ) # announce that we're starting + # Announce the analysis + logger.info(termcolor("Running ANIb analysis", "red")) # Get BLAST+ version - this will be used in the database entries blastn_version = anib.get_version(args.blastn_exe) @@ -102,7 +102,7 @@ def subcmd_anib(args: Namespace) -> None: # Use provided name, or make new one for this analysis start_time = datetime.datetime.now() name = args.name or "_".join(["ANIb", start_time.isoformat()]) - logger.info("Analysis name: %s", name) + logger.info(termcolor("Analysis name: %s", "cyan"), name) # Connect to existing database (which may be "clean" or have old analyses) logger.debug("Connecting to database %s", args.dbpath) @@ -225,7 +225,7 @@ def subcmd_anib(args: Namespace) -> None: existingfiles[0], ) else: - logger.debug(f"\tIdentified no existing output files") + logger.debug("\tIdentified no existing output files") else: existingfiles = list() logger.debug("\tAssuming no pre-existing output files") @@ -248,7 +248,7 @@ def subcmd_anib(args: Namespace) -> None: joblist = generate_joblist( comparisons_to_run, existingfiles, fragfiles, fraglens, args ) - logger.debug(f"...created %s blastn jobs", len(joblist)) + logger.debug("...created %s blastn jobs", len(joblist)) raise NotImplementedError From 7e9ebe250922402ae913b850585d1ba61ad7afdd Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 21 Jul 2021 19:55:50 +0100 Subject: [PATCH 03/75] Added `sysexit` when adding genomes to the database fails --- pyani/scripts/subcommands/subcmd_anib.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index d46547c5..e412eab4 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -140,6 +140,7 @@ def subcmd_anib(args: Namespace) -> None: logger.error( "Could not add genomes to database for run %s (exiting)", run, exc_info=True ) + raise SystemExit(1) logger.debug("\t...added genome IDs: %s", genome_ids) # Get list of genomes for this analysis from the database From 70abeb91322379ad3888590b175da103d058b01d Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 21 Jul 2021 20:20:06 +0100 Subject: [PATCH 04/75] Added initial versions of functions and comments about needed code `run_anib_jobs()`, `update_comparison_results()` --- pyani/scripts/subcommands/subcmd_anib.py | 110 ++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index e412eab4..5394558e 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -47,7 +47,7 @@ from argparse import Namespace from itertools import permutations from pathlib import Path -from typing import List, Tuple +from typing import List, NamedTuple, Tuple from Bio import SeqIO from tqdm import tqdm @@ -55,16 +55,33 @@ from pyani import anib from pyani.pyani_files import collect_existing_output from pyani.pyani_orm import ( - PyaniORMException, add_run, add_run_genomes, + add_blastdb, + Comparison, filter_existing_comparisons, get_session, + pyani_jobs, + PyaniORMException, update_comparison_matrices, ) from pyani.pyani_tools import termcolor +# Convenience struct describing a pairwise comparison job for the SQLAlchemy +# implementation +class ComparisonJob(NamedTuple): + + """Pairwise comparison job for the SQLAlchemy implementation""" + + query: str + subject: str + blastcmd: str + outfile: Path + fragsize: int + job: pyani_jobs.Job + + def subcmd_anib(args: Namespace) -> None: """Perform ANIb on all genome files in an input directory. @@ -164,6 +181,12 @@ def subcmd_anib(args: Namespace) -> None: os.makedirs(fragdir, exist_ok=True) os.makedirs(blastdbdir, exist_ok=True) + # jobgraph = anib.make_job_graph( + # infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir) + # ) + + # Create dictionary of database building commands, keyed by dbname + # Create a new sequence fragment file and a new BLAST+ database for each input genome, # and add this data to the database as a row in BlastDB logger.info("Creating input sequence fragment files") @@ -304,3 +327,86 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path fragpath = outdir / f"{inpath.stem}-fragments.fasta" SeqIO.write(outseqs, fragpath, "fasta") return fragpath, json.dumps(sizedict) + + +# def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: +# """Pass ANIb blastn jobs to the scheduler. +# +# :param joblist: list of ComparisonJob namedtuples +# :param args: command-line arguments for the run +# """ +# logger = logging.getLogger(__name__) +# logger.debug("Scheduler: %s", args.scheduler) +# +# # Entries with None seen in recovery mode: +# jobs = [_.job for _ in joblist if _.job] +# +# if args.scheduler == "multiprocessing": +# logger.info("Running jobs with multiprocessing") +# if not args.workers: +# logger.debug("(using maximum number of worker threads)") +# else: +# logger.debug("(using %d worker threads, if available)", args.workers) +# cumval = run_mp.run_dependency_graph(jobs, workers=args.workers) +# if cumval > 0: +# logger.error( +# "At least one blastn comparison failed. Please investigate (exiting)" +# ) +# raise PyaniException("Multiprocessing run failed in ANIb") +# logger.info("Multiprocessing run completed without error") +# elif args.scheduler.lower() == "sge": +# logger.info("Running jobs with SGE") +# logger.debug("Setting jobarray group size to %d", args.sgegroupsize) +# logger.debug("Joblist contains %d jobs", len(joblist)) +# run_sge.run_dependency_graph( +# jobs, +# jgprefix=args.jobprefix, +# sgegroupsize=args.sgegroupsize, +# sgeargs=args.sgeargs, +# ) +# else: +# logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler) +# raise SystemError(1) +# +# +# def update_comparison_results( +# joblist: List[ComparisonJob], run, session, blastn_version: str, args: Namespace +# ) -> None: +# """Update the Comparision table with the completed result set. +# +# :param joblist: list of ComparisonJob namedtuples +# :param run: Run ORM object for the current ANIb run +# :param session: active pyanidb session via ORM +# :param blastn_version: version of blastn used for the comparison +# :param args: command-line arguments for this run +# +# The Comparison table stores individual comparison results, one per row. +# """ +# logger = logging.getLogger(__name__) +# +# # Add individual results to Comparison table +# for job in tqdm(joblist, disable=args.disable_tqdm): +# logger.debug("\t%s vs %s", job.query.description, job.subject.description) +# aln_length, sim_errs, ani_pid = anib.process_blast_tab(job.outfile) +# qcov = aln_length / job.query.length +# scov = aln_length / job.subject.length +# run.comparisons.append( +# Comparison( +# query=job.query, +# subject=job.subject, +# aln_length=aln_length, +# sim_errs=sim_errs, +# identity=ani_pid, +# cov_query=qcov, +# cov_subject=scov, +# program="blastn", +# version=blastn_version, +# fragsize=job.fragsize, +# maxmatch=False, +# ) +# ) +# +# # Populate db +# logger.debug("Committing results to database") +# session.commit() +# From 9dcade60d062a24ef3aae2fb9e2bcda199501e7a Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 21 Jul 2021 20:20:41 +0100 Subject: [PATCH 05/75] Rename `fraglens` to `fragsizes` for consistency with `anib.py` --- pyani/scripts/subcommands/subcmd_anib.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 5394558e..6555b96e 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -191,13 +191,12 @@ def subcmd_anib(args: Namespace) -> None: # and add this data to the database as a row in BlastDB logger.info("Creating input sequence fragment files") for genome in genomes: - fragpath, fraglengths = fragment_fasta_file( + fragpath, fragsizes = fragment_fasta_file( Path(str(genome.path)), Path(str(fragdir)), args.fragsize ) - print(fragpath, len(fraglengths)) - # blastdb = add_blastdb( - # session, genome, run, fragpath, dbpath, fraglengths, dbcmd - # ) + print(fragpath, len(fragsizes)) + + # blastdb = add_blastdb(session, genome, run, fragpath, dbpath, fragsizes, dbcmd) raise NotImplementedError @@ -281,7 +280,7 @@ def generate_joblist( comparisons: List, existingfiles: List, fragfiles: List, - fraglens: List, + fragsizes: List, args: Namespace, ) -> NotImplementedError: """Return list of ComparisonJobs. @@ -289,7 +288,7 @@ def generate_joblist( :param comparisons: list of (Genome, Genome) tuples for which comparisons are needed :param existingfiles: list of pre-existing BLASTN+ outputs :param fragfiles: - :param fraglens: + :param fragsizes: :param args: Namespace, command-line arguments """ # logger = logging.getLogger(__name__) From c660e63c61f9757670f2e6a2cd754bacec15d0e9 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Mon, 13 Sep 2021 23:53:11 +0100 Subject: [PATCH 06/75] Add `add_blastdb()` to `pyani_orm.py` --- pyani/pyani_orm.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py index bb4aeb9e..e2b16c26 100644 --- a/pyani/pyani_orm.py +++ b/pyani/pyani_orm.py @@ -312,6 +312,39 @@ def __repr__(self) -> str: return "".format(self.comparison_id) +def add_blastdb(session, genome, run, fragpath, dbpath, fragsizes, dbcmd) -> None: + """Update the Blastdb table with a fragment dictionary for genome. + + :param session: active pyanidb session via ORM + :param genome: Genome ORM object + :param run: Run ORM object for the current ANIb run + :param fragpath: Path to the fragment sequence file for Genome + :param dbpath: Path to the database file + :param fragsizes: fragment size dictionary for Genome + :param dbcmd: command used to generate database + + """ + try: + blastdb = BlastDB( + genome_id=genome.genome_id, + run_id=run.run_id, + fragpath=str(fragpath), + dbpath=str(dbpath), + fragsizes=fragsizes, + dbcmd=dbcmd, + ) + except Exception: + raise PyaniORMException( + f"Could not create {genome} blast database with command line: {dbcmd}" + ) + try: + session.add(blastdb) + session.commit() + except Exception: + raise PyaniORMException(f"Could not add blastdb for {genome} to the database") + return blastdb + + def create_db(dbpath: Path) -> None: """Create an empty pyani SQLite3 database at the passed path. From bf4fa20f49a4e1070d4ce2b89ab2f7bca9ca58e3 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Mon, 13 Sep 2021 23:54:10 +0100 Subject: [PATCH 07/75] Change default for `maxmatch` from `None` to `False` If this is not a boolean, it prevents sqlite from recognising duplicate entries --- pyani/pyani_orm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py index e2b16c26..c2378fe4 100644 --- a/pyani/pyani_orm.py +++ b/pyani/pyani_orm.py @@ -433,7 +433,7 @@ def filter_existing_comparisons( program, version, fragsize: Optional[int] = None, - maxmatch: Optional[bool] = None, + maxmatch: Optional[bool] = False, ) -> List: """Filter list of (Genome, Genome) comparisons for those not in the session db. From c1b4be117e9ff0d2b211e2ed448ad40bfc4dc0dc Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:00:19 +0100 Subject: [PATCH 08/75] Add/expand code to process input genomes Up to adding them to the database --- pyani/scripts/subcommands/subcmd_anib.py | 40 ++++++++++++++++-------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 6555b96e..97d9c905 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -43,16 +43,24 @@ import json import logging import os +import subprocess from argparse import Namespace from itertools import permutations from pathlib import Path -from typing import List, NamedTuple, Tuple +from typing import List, NamedTuple, Tuple, Dict from Bio import SeqIO from tqdm import tqdm -from pyani import anib +from pyani import ( + PyaniException, + anib, + pyani_config, + pyani_jobs, + run_sge, + run_multiprocessing as run_mp, +) from pyani.pyani_files import collect_existing_output from pyani.pyani_orm import ( add_run, @@ -61,7 +69,6 @@ Comparison, filter_existing_comparisons, get_session, - pyani_jobs, PyaniORMException, update_comparison_matrices, ) @@ -181,24 +188,31 @@ def subcmd_anib(args: Namespace) -> None: os.makedirs(fragdir, exist_ok=True) os.makedirs(blastdbdir, exist_ok=True) - # jobgraph = anib.make_job_graph( - # infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir) - # ) - - # Create dictionary of database building commands, keyed by dbname - # Create a new sequence fragment file and a new BLAST+ database for each input genome, # and add this data to the database as a row in BlastDB logger.info("Creating input sequence fragment files") + fragfiles = {} + fraglens = {} for genome in genomes: fragpath, fragsizes = fragment_fasta_file( Path(str(genome.path)), Path(str(fragdir)), args.fragsize ) - print(fragpath, len(fragsizes)) - - # blastdb = add_blastdb(session, genome, run, fragpath, dbpath, fragsizes, dbcmd) + fragfiles.update({genome: fragpath}) + fraglens.update({genome: fragsizes}) + + dbcmd, blastdbpath = anib.construct_makeblastdb_cmd( + fragpath, blastdbdir + ) # args.outdir) + + subprocess.run( + dbcmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) - raise NotImplementedError + add_blastdb(session, genome, run, fragpath, blastdbpath, fragsizes, dbcmd) # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples logger.info( From 34f9ec31579a7ac1929844498e3970b82e36dc31 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:03:42 +0100 Subject: [PATCH 09/75] Split genome files into contiguous fragments --- pyani/scripts/subcommands/subcmd_anib.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 97d9c905..66e7cc32 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -267,18 +267,18 @@ def subcmd_anib(args: Namespace) -> None: existingfiles = list() logger.debug("\tAssuming no pre-existing output files") - # Split the input genome files into contiguous fragments of the specified size, - # as described in Goris et al. We create a new directory to hold sequence - # fragments, away from the main genomes - logger.info("Splitting input genome files into %snt fragments...", args.fragsize) - fragdir = Path(args.outdir) / "fragments" - os.makedirs(fragdir, exist_ok=True) - fragfiles, fraglens = anib.fragment_fasta_files( - [Path(str(_.path)) for _ in genomes], - Path(args.outdir) / "fragments", - args.fragsize, - ) - logger.debug("...wrote %s fragment files to %s", len(fragfiles), fragdir) + ## Split the input genome files into contiguous fragments of the specified size, + ## as described in Goris et al. We create a new directory to hold sequence + ## fragments, away from the main genomes + # logger.info("Splitting input genome files into %snt fragments...", args.fragsize) + # fragdir = Path(args.outdir) / "fragments" + # os.makedirs(fragdir, exist_ok=True) + # fragfiles, fraglens = anib.fragment_fasta_files( + # [Path(str(_.path)) for _ in genomes], + # Path(args.outdir) / "fragments", + # args.fragsize, + # ) + # logger.debug("...wrote %s fragment files to %s", len(fragfiles), fragdir) # Create list of BLASTN jobs for each comparison still to be performed logger.info("Creating blastn jobs for ANIb...") From 5cdd45298df4ff28c3bc04b27eb1f1e4237ee9cf Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:06:03 +0100 Subject: [PATCH 10/75] Implement `generate_joblist()` --- pyani/scripts/subcommands/subcmd_anib.py | 50 +++++++++++++++++++++--- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 66e7cc32..d7b3f63c 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -296,17 +296,57 @@ def generate_joblist( fragfiles: List, fragsizes: List, args: Namespace, -) -> NotImplementedError: +) -> List[ComparisonJob]: """Return list of ComparisonJobs. :param comparisons: list of (Genome, Genome) tuples for which comparisons are needed :param existingfiles: list of pre-existing BLASTN+ outputs - :param fragfiles: - :param fragsizes: + :param fragfiles: list of files containing genome fragments + :param fragsizes: list of fragment lengths :param args: Namespace, command-line arguments """ - # logger = logging.getLogger(__name__) - raise NotImplementedError + logger = logging.getLogger(__name__) + + existingfiles = set(existingfiles) # Path objects hashable + + joblist = [] # will hold ComparisonJob structs + for idx, (query, subject) in enumerate( + tqdm(comparisons, disable=args.disable_tqdm) + ): + # fname1 = Path(query.path) + # fname2 = Path(subject.path) + blastcmd = anib.generate_blastn_commands( + Path(query.path), Path(subject.path), args.outdir, args.blastn_exe + ) + logger.debug("Commands to run:\n\t%s\n", blastcmd) + outprefix = blastcmd.split()[2][:-10] # prefix for blastn output + outfname = Path(outprefix + ".blast_tab") + logger.debug("Expected output file for db: %s", outfname) + + # If e're in recovery mode, we don't want to repeat a computational + # comparison that already exists, so we check whether the ultimate + # output is in the set of existing files and, if not, we add the obs + + # The comparisons collection always gets updated, so that results are + # added to the database whether they come from recovery mode or are run + # in this call of the script. + if args.recovery and outfname in existingfiles: + logger.debug("Recovering output from %s, not submitting job", outfname) + # Need to track the expected output, but set the job itself to None: + joblist.append( + ComparisonJob(query, subject, blastcmd, outfname, args.fragsize, None) + ) + else: + logger.debug("Building job") + # Build job + blastjob = pyani_jobs.Job("%s_%06d-blast" % (args.jobprefix, idx), blastcmd) + joblist.append( + ComparisonJob( + query, subject, blastcmd, outfname, args.fragsize, blastjob + ) + ) + return joblist + # raise NotImplementedError def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: From d78b4799fa458da3ee8e3b4ba6ae528a0ba2777d Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:08:08 +0100 Subject: [PATCH 11/75] Implement `run_anib_jobs()` --- pyani/scripts/subcommands/subcmd_anib.py | 78 ++++++++++++------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index d7b3f63c..448555f8 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -382,44 +382,46 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path return fragpath, json.dumps(sizedict) -# def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: -# """Pass ANIb blastn jobs to the scheduler. -# -# :param joblist: list of ComparisonJob namedtuples -# :param args: command-line arguments for the run -# """ -# logger = logging.getLogger(__name__) -# logger.debug("Scheduler: %s", args.scheduler) -# -# # Entries with None seen in recovery mode: -# jobs = [_.job for _ in joblist if _.job] -# -# if args.scheduler == "multiprocessing": -# logger.info("Running jobs with multiprocessing") -# if not args.workers: -# logger.debug("(using maximum number of worker threads)") -# else: -# logger.debug("(using %d worker threads, if available)", args.workers) -# cumval = run_mp.run_dependency_graph(jobs, workers=args.workers) -# if cumval > 0: -# logger.error( -# "At least one blastn comparison failed. Please investigate (exiting)" -# ) -# raise PyaniException("Multiprocessing run failed in ANIb") -# logger.info("Multiprocessing run completed without error") -# elif args.scheduler.lower() == "sge": -# logger.info("Running jobs with SGE") -# logger.debug("Setting jobarray group size to %d", args.sgegroupsize) -# logger.debug("Joblist contains %d jobs", len(joblist)) -# run_sge.run_dependency_graph( -# jobs, -# jgprefix=args.jobprefix, -# sgegroupsize=args.sgegroupsize, -# sgeargs=args.sgeargs, -# ) -# else: -# logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler) -# raise SystemError(1) +def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: + """Pass ANIb blastn jobs to the scheduler. + + :param joblist: list of ComparisonJob namedtuples + :param args: command-line arguments for the run + """ + logger = logging.getLogger(__name__) + logger.debug("Scheduler: %s", args.scheduler) + + # Entries with None seen in recovery mode: + jobs = [_.job for _ in joblist if _.job] + + if args.scheduler == "multiprocessing": + logger.info("Running jobs with multiprocessing") + if not args.workers: + logger.debug("(using maximum number of worker threads)") + else: + logger.debug("(using %d worker threads, if available)", args.workers) + cumval = run_mp.run_dependency_graph(jobs, workers=args.workers) + if cumval > 0: + logger.error( + "At least one blastn comparison failed. Please investigate (exiting)" + ) + raise PyaniException("Multiprocessing run failed in ANIb") + logger.info("Multiprocessing run completed without error") + elif args.scheduler.lower() == "sge": + logger.info("Running jobs with SGE") + logger.debug("Setting jobarray group size to %d", args.sgegroupsize) + logger.debug("Joblist contains %d jobs", len(joblist)) + run_sge.run_dependency_graph( + jobs, + jgprefix=args.jobprefix, + sgegroupsize=args.sgegroupsize, + sgeargs=args.sgeargs, + ) + else: + logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler) + raise SystemError(1) + + # # # def update_comparison_results( From 0d42e79556c15d9af83a2507709f09febd12f8de Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:09:37 +0100 Subject: [PATCH 12/75] Implement `update_comparisons_results()` and commit to database --- pyani/scripts/subcommands/subcmd_anib.py | 87 +++++++++++++----------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 448555f8..f58b2508 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -424,44 +424,49 @@ def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: # # -# def update_comparison_results( -# joblist: List[ComparisonJob], run, session, blastn_version: str, args: Namespace -# ) -> None: -# """Update the Comparision table with the completed result set. -# -# :param joblist: list of ComparisonJob namedtuples -# :param run: Run ORM object for the current ANIb run -# :param session: active pyanidb session via ORM -# :param blastn_version: version of blastn used for the comparison -# :param args: command-line arguments for this run -# -# The Comparison table stores individual comparison results, one per row. -# """ -# logger = logging.getLogger(__name__) -# -# # Add individual results to Comparison table -# for job in tqdm(joblist, disable=args.disable_tqdm): -# logger.debug("\t%s vs %s", job.query.description, job.subject.description) -# aln_length, sim_errs, ani_pid = anib.process_blast_tab(job.outfile) -# qcov = aln_length / job.query.length -# scov = aln_length / job.subject.length -# run.comparisons.append( -# Comparison( -# query=job.query, -# subject=job.subject, -# aln_length=aln_length, -# sim_errs=sim_errs, -# identity=ani_pid, -# cov_query=qcov, -# cov_subject=scov, -# program="blastn", -# version=blastn_version, -# fragsize=job.fragsize, -# maxmatch=False, -# ) -# ) -# -# # Populate db -# logger.debug("Committing results to database") -# session.commit() -# +def update_comparison_results( + joblist: List[ComparisonJob], + run, + session, + blastn_version: str, + fraglens: Dict, + args: Namespace, +) -> None: + """Update the Comparision table with the completed result set. + # + :param joblist: list of ComparisonJob namedtuples + :param run: Run ORM object for the current ANIb run + :param session: active pyanidb session via ORM + :param blastn_version: version of blastn used for the comparison + :param fraglens: dictionary of fragment lengths for each genome + :param args: command-line arguments for this run + # + The Comparison table stores individual comparison results, one per row. + """ + logger = logging.getLogger(__name__) + # + # Add individual results to Comparison table + for job in tqdm(joblist, disable=args.disable_tqdm): + logger.debug("\t%s vs %s", job.query.description, job.subject.description) + aln_length, sim_errs, ani_pid = anib.parse_blast_tab(job.outfile, fraglens) + qcov = aln_length / job.query.length + scov = aln_length / job.subject.length + run.comparisons.append( + Comparison( + query=job.query, + subject=job.subject, + aln_length=aln_length, + sim_errs=sim_errs, + identity=ani_pid, + cov_query=qcov, + cov_subject=scov, + program="blastn", + version=blastn_version, + fragsize=job.fragsize, + maxmatch=False, + ) + ) + # + # Populate db + logger.debug("Committing results to database") + session.commit() From 9e25141c541fc8915fb6ff0cceee39680b527f4e Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:11:13 +0100 Subject: [PATCH 13/75] Update call to `generate_joblist()` Now passes the lists of all `fragfiles` and `fraglens` --- pyani/scripts/subcommands/subcmd_anib.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index f58b2508..39cdab25 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -282,8 +282,10 @@ def subcmd_anib(args: Namespace) -> None: # Create list of BLASTN jobs for each comparison still to be performed logger.info("Creating blastn jobs for ANIb...") + # This method considered, but doesn't get fraglens + # fragfiles = [file for file in fragdir.iterdir()] joblist = generate_joblist( - comparisons_to_run, existingfiles, fragfiles, fraglens, args + comparisons_to_run, existingfiles, fragfiles.values(), fraglens.values(), args ) logger.debug("...created %s blastn jobs", len(joblist)) From 1e8a47dabf5417f25c5f640190554196fca67e87 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:12:19 +0100 Subject: [PATCH 14/75] Update name of output file in `fragment_fasta_file()` --- pyani/scripts/subcommands/subcmd_anib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 39cdab25..70a0d951 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -379,7 +379,7 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path idx += fragsize # Write fragments to output file - fragpath = outdir / f"{inpath.stem}-fragments.fasta" + fragpath = outdir / f"{inpath.stem}-fragments.fna" SeqIO.write(outseqs, fragpath, "fasta") return fragpath, json.dumps(sizedict) From b07a1dd59414024481176320caf91b8f22c60361 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:13:10 +0100 Subject: [PATCH 15/75] Update value passed for `maxmatch` to a boolean Necessary for sqlite to recognise duplicate values --- pyani/scripts/subcommands/subcmd_anib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 70a0d951..8e6f8da6 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -226,7 +226,7 @@ def subcmd_anib(args: Namespace) -> None: # but remove it from the list of comparisons to be performed logger.info("Checking database for existing comparison data...") comparisons_to_run = filter_existing_comparisons( - session, run, comparisons, "blastn", blastn_version, args.fragsize, None + session, run, comparisons, "blastn", blastn_version, args.fragsize, False ) logger.info( f"\t...after check, still need to run {len(comparisons_to_run)} comparisons" From 8f4b131fb9a7be342ad361fbd331b52fcf5638c9 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:13:58 +0100 Subject: [PATCH 16/75] Implement remainder of `subcmd_anib()` --- pyani/scripts/subcommands/subcmd_anib.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 8e6f8da6..5d8e64b3 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -288,8 +288,21 @@ def subcmd_anib(args: Namespace) -> None: comparisons_to_run, existingfiles, fragfiles.values(), fraglens.values(), args ) logger.debug("...created %s blastn jobs", len(joblist)) + # print(f"Joblist: {joblist}") + # Pass jobs to appropriate scheduler + logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler) + run_anib_jobs(joblist, args) + logger.info("...jobs complete.") + + # Process output and add results to database + # This requires us to drop out of threading/multiprocessing: Python's SQLite3 + # interface doesn't allow sharing connections and cursors + logger.info("Adding comparison results to database...") + update_comparison_results(joblist, run, session, blastn_version, fraglens, args) + update_comparison_matrices(session, run) + logger.info("...database updated.") - raise NotImplementedError + # raise NotImplementedError def generate_joblist( From c76f54c3a7fe88aed7d448f334c98ce57f143b5f Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:14:11 +0100 Subject: [PATCH 17/75] Add a commented question --- pyani/scripts/subcommands/subcmd_anib.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 5d8e64b3..abf1cda6 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -249,6 +249,7 @@ def subcmd_anib(args: Namespace) -> None: # run, and do not necessarily need to rerun all the jobs. In this case, # we prepare a list of output files we want to recover from the results # in the output directory. + # ΒΆ Should this use output files, or pull from the database? if args.recovery: logger.warning("Entering recovery mode...") logger.debug( From c71ce5d58bfa890578c44d3994e545b506c7126d Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:17:14 +0100 Subject: [PATCH 18/75] Alter `generate_blastn_commands()` to only take one query/subject pair --- pyani/anib.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 89848e49..7bb28376 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -343,13 +343,15 @@ def construct_makeblastdb_cmd( # Generate list of BLASTN command lines from passed filenames def generate_blastn_commands( - filenames: List[Path], + query: Path, + subject: Path, outdir: Path, blast_exe: Optional[Path] = None, ) -> List[str]: """Return a list of blastn command-lines for ANIb. - :param filenames: a list of paths to fragmented input FASTA files + :param query: a paths to the query's fragmented input FASTA file + :param subject: a paths to the subject's fragmented input FASTA file :param outdir: path to output directory :param blastn_exe: path to BLASTN executable :param method: str, analysis type (ANIb) @@ -359,22 +361,12 @@ def generate_blastn_commands( have the form ACCESSION.ext. This is the convention followed by the fragment_FASTA_files() function above. """ - cmdlines = [] - for idx, fname1 in enumerate(filenames[:-1]): - dbname1 = Path(str(fname1).replace("-fragments", "")) - for fname2 in filenames[idx + 1 :]: - dbname2 = Path(str(fname2).replace("-fragments", "")) - if blast_exe is None: - cmdlines.append(construct_blastn_cmdline(fname1, dbname2, outdir)) - cmdlines.append(construct_blastn_cmdline(fname2, dbname1, outdir)) - else: - cmdlines.append( - construct_blastn_cmdline(fname1, dbname2, outdir, blast_exe) - ) - cmdlines.append( - construct_blastn_cmdline(fname2, dbname1, outdir, blast_exe) - ) - return cmdlines + subj_db = outdir / "blastdbs" / Path(str(subject.name)) + if blast_exe is None: + cmdline = construct_blastn_cmdline(query, subj_db, outdir) + else: + cmdline = construct_blastn_cmdline(query, subj_db, outdir, blast_exe) + return cmdline # Generate single BLASTN command line From 585b98f119014fff57973dea5de24d0b237ec95d Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:18:20 +0100 Subject: [PATCH 19/75] Alter `construct_blastn_cmdline()` for a single query/subject pair --- pyani/anib.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 7bb28376..11a75b62 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -371,21 +371,21 @@ def generate_blastn_commands( # Generate single BLASTN command line def construct_blastn_cmdline( - fname1: Path, - fname2: Path, + query: Path, + subj_db: Path, outdir: Path, blastn_exe: Path = pyani_config.BLASTN_DEFAULT, ) -> str: """Return a single blastn command. - :param fname1: - :param fname2: - :param outdir: + :param fname1: Path, FASTA file for query genome + :param fname2: Path, database of fragments for subject genome + :param outdir: Path, to the output directory :param blastn_exe: str, path to blastn executable """ - prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}" + prefix = outdir / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}" return ( - f"{blastn_exe} -out {prefix}.blast_tab -query {fname1} -db {fname2} " + f"{blastn_exe} -out {prefix}.blast_tab -query {query} -db {subj_db} " "-xdrop_gap_final 150 -dust no -evalue 1e-15 -max_target_seqs 1 -outfmt " "'6 qseqid sseqid length mismatch pident nident qlen slen " "qstart qend sstart send positive ppos gaps' " From 059ffdf6b3120de7cb3ac7f26fd5b6cbe91c4fba Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:18:49 +0100 Subject: [PATCH 20/75] Remove `method` parameter from `process_blast()` call --- pyani/anib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/anib.py b/pyani/anib.py index 11a75b62..b9c111b6 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -453,7 +453,7 @@ def process_blast( blastfile, ) continue - resultvals = parse_blast_tab(blastfile, fraglengths, method) + resultvals = parse_blast_tab(blastfile, fraglengths) query_cover = float(resultvals[0]) / org_lengths[qname] # Populate dataframes: when assigning data, we need to note that From aed81e1ebf9440a5df62f2b88235268082f57e25 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:20:00 +0100 Subject: [PATCH 21/75] Change `outfilename` in `construct_makeblastdb_cmd() ` --- pyani/anib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/anib.py b/pyani/anib.py index b9c111b6..3e8d9476 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -334,7 +334,7 @@ def construct_makeblastdb_cmd( :param outdir: Path, directory for output :param blastdb_exe: Path, path to the makeblastdb executable """ - outfilename = outdir / filename.name + outfilename = outdir / filename.name.replace("-fragments", "") return ( f"{blastdb_exe} -dbtype nucl -in {filename} -title {filename.stem} -out {outfilename}", outfilename, From 0f0653ffd2b22ce5c2087d7a7bfec6419a508a0a Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:21:05 +0100 Subject: [PATCH 22/75] Move `aniblastall`-specific tests to `test_aniblastall.py` --- tests/test_anib.py | 111 +--------------------- tests/test_aniblastall.py | 189 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 110 deletions(-) create mode 100644 tests/test_aniblastall.py diff --git a/tests/test_anib.py b/tests/test_anib.py index 4ba75099..b15423bd 100644 --- a/tests/test_anib.py +++ b/tests/test_anib.py @@ -119,65 +119,6 @@ def anib_output_dir(dir_anib_in): ) -# Test legacy BLAST (blastall) command generation -def test_blastall_dbjobdict(path_fna_all, tmp_path): - """Generate dictionary of legacy BLASTN database jobs.""" - blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path) - jobdict = anib.build_db_jobs(path_fna_all, blastcmds) - expected = [ - (tmp_path / _.name, f"formatdb -p F -i {tmp_path / _.name} -t {_.stem}") - for _ in path_fna_all - ] - assert sorted([(k, v.script) for (k, v) in jobdict.items()]) == sorted(expected) - - -def test_blastall_graph(path_fna_all, tmp_path, fragment_length): - """Create jobgraph for legacy BLASTN jobs.""" - fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length) - blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path) - jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds) - # We check that the main script job is a blastn job, and that there - # is a single dependency, which is a makeblastdb job - for job in jobgraph: - assert job.script.startswith("blastall -p blastn") - assert len(job.dependencies) == 1 - assert job.dependencies[0].script.startswith("formatdb") - - -def test_blastall_multiple(path_fna_two, tmp_path): - """Generate legacy BLASTN commands.""" - cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIblastall") - expected = [ - ( - "blastall -p blastn -o " - f"{tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} " - f"-i {path_fna_two[0]} " - f"-d {path_fna_two[1]} " - "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" - ), - ( - "blastall -p blastn -o " - f"{tmp_path / str(path_fna_two[1].stem + '_vs_' + path_fna_two[0].stem + '.blast_tab')} " - f"-i {path_fna_two[1]} " - f"-d {path_fna_two[0]} " - "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" - ), - ] - assert cmds == expected - - -def test_blastall_single(path_fna_two, tmp_path): - """Generate legacy BLASTN command-line.""" - cmd = anib.construct_blastall_cmdline(path_fna_two[0], path_fna_two[1], tmp_path) - expected = ( - f"blastall -p blastn -o {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} " - f"-i {path_fna_two[0]} " - f"-d {path_fna_two[1]} " - "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" - ) - assert cmd == expected - - # Test BLAST+ (blastn) command generation def test_blastn_dbjobdict(path_fna_all, tmp_path): """Generate dictionary of BLASTN+ database jobs.""" @@ -248,30 +189,6 @@ def test_blastn_single(path_fna_two, tmp_path): assert cmd == expected -# Test legacy BLAST database formatting (formatdb) command generation -def test_formatdb_multiple(path_fna_two, tmp_path): - """Generate legacy BLAST db creation commands.""" - cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIblastall") - expected = [ - ( - f"formatdb -p F -i {tmp_path / path_fna_two[0].name} -t {path_fna_two[0].stem}", - tmp_path / path_fna_two[0].name, - ), - ( - f"formatdb -p F -i {tmp_path / path_fna_two[1].name} -t {path_fna_two[1].stem}", - tmp_path / path_fna_two[1].name, - ), - ] - assert cmds == expected - - -def test_formatdb_single(path_fna, tmp_path): - """Generate legacy BLAST formatdb command-line.""" - cmd = anib.construct_formatdb_cmd(path_fna, tmp_path) - expected = f"formatdb -p F -i {tmp_path / path_fna.name} -t {path_fna.stem}" - assert cmd[0] == expected - - # Test FASTA file fragmentation for ANIb methods def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_length): """Fragment files for ANIb/ANIblastall.""" @@ -317,26 +234,11 @@ def test_makeblastdb_single(path_fna, tmp_path): # Test output file parsing for ANIb methods -def test_parse_legacy_blastdir(anib_output_dir): - """Parses directory of legacy BLAST output.""" - orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles) - fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles) - result = anib.process_blast( - anib_output_dir.legacyblastdir, orglengths, fraglengths, mode="ANIblastall" - ) - assert_frame_equal( - result.percentage_identity.sort_index(1).sort_index(), - anib_output_dir.legacyblastresult.sort_index(1).sort_index(), - ) - - def test_parse_blastdir(anib_output_dir): """Parse directory of BLAST+ output.""" orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles) fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles) - result = anib.process_blast( - anib_output_dir.blastdir, orglengths, fraglengths, mode="ANIb" - ) + result = anib.process_blast(anib_output_dir.blastdir, orglengths, fraglengths) assert_frame_equal( result.percentage_identity.sort_index(1).sort_index(), anib_output_dir.blastresult.sort_index(1).sort_index(), @@ -348,14 +250,3 @@ def test_parse_blasttab(anib_output): fragdata = anib.get_fraglength_dict([anib_output.fragfile]) result = anib.parse_blast_tab(anib_output.tabfile, fragdata, mode="ANIb") assert (a == b for a, b in zip(result, [4_016_551, 93, 99.997_693_577_050_029])) - - -def test_parse_legacy_blasttab(anib_output): - """Parses ANIB legacy .blast_tab output.""" - fragdata = anib.get_fraglength_dict([anib_output.fragfile]) - result = anib.parse_blast_tab( - anib_output.legacytabfile, fragdata, mode="ANIblastall" - ) - assert ( - a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018]) - ) diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py new file mode 100644 index 00000000..339c9263 --- /dev/null +++ b/tests/test_aniblastall.py @@ -0,0 +1,189 @@ +"""Test anib.py module. + +These tests are intended to be run from the repository root using: + +pytest -v +""" + +from pathlib import Path +from typing import List, NamedTuple + +import pandas as pd +import pytest # noqa: F401 # pylint: disable=unused-import + +from pandas.util.testing import assert_frame_equal + +from pyani import anib, pyani_files + + +class ANIbOutput(NamedTuple): + + """Convenience struct for ANIb output.""" + + fragfile: Path + tabfile: Path + legacytabfile: Path + + +class ANIbOutputDir(NamedTuple): + + """Convenience struct for ANIb output.""" + + infiles: List[Path] + fragfiles: List[Path] + blastdir: Path + legacyblastdir: Path + blastresult: pd.DataFrame + legacyblastresult: pd.DataFrame + + +@pytest.fixture +def anib_output(dir_anib_in): + """Namedtuple of example ANIb output. + + fragfile - fragmented FASTA query file + tabfile - BLAST+ tabular output + legacytabfile - blastall tabular output + """ + return ANIbOutput( + dir_anib_in / "NC_002696-fragments.fna", + dir_anib_in / "NC_002696_vs_NC_011916.blast_tab", + dir_anib_in / "NC_002696_vs_NC_010338.blast_tab", + ) + + +@pytest.fixture +def anib_output_dir(dir_anib_in): + """Namedtuple of example ANIb output - full directory. + + infiles - list of FASTA query files + fragfiles - list of fragmented FASTA query files + blastdir - path to BLAST+ output data + legacyblastdir - path to blastall output data + blastresult - pd.DataFrame result for BLAST+ + legacyblastresult - pd.DataFrame result for blastall + """ + return ANIbOutputDir( + [ + _ + for _ in (dir_anib_in / "sequences").iterdir() + if _.is_file() and _.suffix == ".fna" + ], + [ + _ + for _ in (dir_anib_in / "fragfiles").iterdir() + if _.is_file() and _.suffix == ".fna" + ], + dir_anib_in / "blastn", + dir_anib_in / "blastall", + pd.read_csv(dir_anib_in / "dataframes" / "blastn_result.csv", index_col=0), + pd.read_csv(dir_anib_in / "dataframes" / "blastall_result.csv", index_col=0), + ) + + +# Test legacy BLAST (blastall) command generation +def test_blastall_dbjobdict(path_fna_all, tmp_path): + """Generate dictionary of legacy BLASTN database jobs.""" + blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path) + jobdict = anib.build_db_jobs(path_fna_all, blastcmds) + expected = [ + (tmp_path / _.name, f"formatdb -p F -i {tmp_path / _.name} -t {_.stem}") + for _ in path_fna_all + ] + assert sorted([(k, v.script) for (k, v) in jobdict.items()]) == sorted(expected) + + +def test_blastall_graph(path_fna_all, tmp_path, fragment_length): + """Create jobgraph for legacy BLASTN jobs.""" + fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length) + blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path) + jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds) + # We check that the main script job is a blastn job, and that there + # is a single dependency, which is a makeblastdb job + for job in jobgraph: + assert job.script.startswith("blastall -p blastn") + assert len(job.dependencies) == 1 + assert job.dependencies[0].script.startswith("formatdb") + + +def test_blastall_multiple(path_fna_two, tmp_path): + """Generate legacy BLASTN commands.""" + cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIblastall") + expected = [ + ( + "blastall -p blastn -o " + f"{tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} " + f"-i {path_fna_two[0]} " + f"-d {path_fna_two[1]} " + "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" + ), + ( + "blastall -p blastn -o " + f"{tmp_path / str(path_fna_two[1].stem + '_vs_' + path_fna_two[0].stem + '.blast_tab')} " + f"-i {path_fna_two[1]} " + f"-d {path_fna_two[0]} " + "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" + ), + ] + assert cmds == expected + + +def test_blastall_single(path_fna_two, tmp_path): + """Generate legacy BLASTN command-line.""" + cmd = anib.construct_blastall_cmdline(path_fna_two[0], path_fna_two[1], tmp_path) + expected = ( + f"blastall -p blastn -o {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} " + f"-i {path_fna_two[0]} " + f"-d {path_fna_two[1]} " + "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" + ) + assert cmd == expected + + +# Test legacy BLAST database formatting (formatdb) command generation +def test_formatdb_multiple(path_fna_two, tmp_path): + """Generate legacy BLAST db creation commands.""" + cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIblastall") + expected = [ + ( + f"formatdb -p F -i {tmp_path / path_fna_two[0].name} -t {path_fna_two[0].stem}", + tmp_path / path_fna_two[0].name, + ), + ( + f"formatdb -p F -i {tmp_path / path_fna_two[1].name} -t {path_fna_two[1].stem}", + tmp_path / path_fna_two[1].name, + ), + ] + assert cmds == expected + + +def test_formatdb_single(path_fna, tmp_path): + """Generate legacy BLAST formatdb command-line.""" + cmd = anib.construct_formatdb_cmd(path_fna, tmp_path) + expected = f"formatdb -p F -i {tmp_path / path_fna.name} -t {path_fna.stem}" + assert cmd[0] == expected + + +# Test output file parsing for ANIb methods +def test_parse_legacy_blastdir(anib_output_dir): + """Parses directory of legacy BLAST output.""" + orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles) + fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles) + result = anib.process_blast( + anib_output_dir.legacyblastdir, orglengths, fraglengths, mode="ANIblastall" + ) + assert_frame_equal( + result.percentage_identity.sort_index(1).sort_index(), + anib_output_dir.legacyblastresult.sort_index(1).sort_index(), + ) + + +def test_parse_legacy_blasttab(anib_output): + """Parses ANIB legacy .blast_tab output.""" + fragdata = anib.get_fraglength_dict([anib_output.fragfile]) + result = anib.parse_blast_tab( + anib_output.legacytabfile, fragdata, mode="ANIblastall" + ) + assert ( + a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018]) + ) From fc273df93d786038b6780fcb587ceb5d01b9ff2c Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:21:55 +0100 Subject: [PATCH 23/75] Change call to `parse_blast_tab()` to not use `method` parameter --- tests/test_anib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_anib.py b/tests/test_anib.py index b15423bd..4267697b 100644 --- a/tests/test_anib.py +++ b/tests/test_anib.py @@ -248,5 +248,5 @@ def test_parse_blastdir(anib_output_dir): def test_parse_blasttab(anib_output): """Parse ANIb BLAST+ .blast_tab output.""" fragdata = anib.get_fraglength_dict([anib_output.fragfile]) - result = anib.parse_blast_tab(anib_output.tabfile, fragdata, mode="ANIb") + result = anib.parse_blast_tab(anib_output.tabfile, fragdata) assert (a == b for a, b in zip(result, [4_016_551, 93, 99.997_693_577_050_029])) From cc4d3465798e9df5612d1c2dc133893d5a72c1b3 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:24:23 +0100 Subject: [PATCH 24/75] Skip tests as a result of changes to how command lines are generated Currently, we don't pass a list of genome pairs to `anib.py`. --- tests/test_anib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_anib.py b/tests/test_anib.py index 4267697b..cc6d73d4 100644 --- a/tests/test_anib.py +++ b/tests/test_anib.py @@ -147,6 +147,7 @@ def test_blastn_graph(path_fna_all, tmp_path, fragment_length): assert job.dependencies[0].script.startswith("makeblastdb") +@pytest.mark.skip(reason="unsure this is needed") def test_blastn_multiple(path_fna_two, tmp_path): """Generate BLASTN+ commands.""" # BLAST+ @@ -201,6 +202,7 @@ def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_leng # Test BLAST+ database formatting (makeblastdb) command generation +@pytest.mark.skip(reason="unsure this is needed") def test_makeblastdb_multiple(path_fna_two, tmp_path): """Generate multiple BLAST+ makeblastdb command-lines.""" cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIb") From 0e3d471c4c52a07ab3bf6d29f95857d8c2588ba0 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 00:35:36 +0100 Subject: [PATCH 25/75] Add `get_version()` tests and boilerplate --- tests/test_aniblastall.py | 81 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py index 339c9263..2807ea35 100644 --- a/tests/test_aniblastall.py +++ b/tests/test_aniblastall.py @@ -1,7 +1,42 @@ -"""Test anib.py module. - +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# (c) University of Strathclyde 2019-2021 +# Author: Bailey Harrington +# +# Contact: +# leighton.pritchard@strath.ac.uk +# +# Leighton Pritchard, +# Strathclyde Institute for Pharmacy and Biomedical Sciences, +# 161 Cathedral Street, +# Glasgow, +# G4 0RE +# Scotland, +# UK +# +# The MIT License +# +# Copyright (c) 2019-2021 University of Strathclyde +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +"""Test aniblastall.py module. These tests are intended to be run from the repository root using: - pytest -v """ @@ -14,6 +49,7 @@ from pandas.util.testing import assert_frame_equal from pyani import anib, pyani_files +from pyani import aniblastall class ANIbOutput(NamedTuple): @@ -81,6 +117,45 @@ def anib_output_dir(dir_anib_in): ) +# Test get_version() +# Test case 1: there is no executable +def test_get_version_missing_exe(executable_missing): + """Test behaviour when there is no file at the specified executable location.""" + test_file_1 = Path("/non/existent/blastall") + assert aniblastall.get_version(test_file_1) == f"No blastall at {test_file_1}" + + +# Test case 2: there is a file, but it is not executable +def test_get_version_not_executable(executable_not_executable): + """Test behaviour when the file at the executable location is not executable.""" + test_file_2 = Path("/non/executable/blastall") + assert ( + aniblastall.get_version(test_file_2) + == f"blastall exists at {test_file_2} but not executable" + ) + + +# Test case 3: there is an executable file, but the version can't be retrieved +def test_get_version_no_version(executable_without_version): + """Test behaviour when the version for the executable can not be retrieved.""" + test_file_3 = Path("/missing/version/blastall") + assert ( + aniblastall.get_version(test_file_3) + == f"blastall exists at {test_file_3} but could not retrieve version" + ) + + +# Test case 4: there is an executable file, but it will not run on the OS +def test_get_version_os_incompatible(executable_incompatible_with_os): + """Test behaviour when the program can't run on the operating system. + This will happen with newer versions of MacOS.""" + test_file_4 = Path("/os/incompatible/blastall") + assert ( + aniblastall.get_version(test_file_4) + == f"blastall exists at {test_file_4} but could not be executed" + ) + + # Test legacy BLAST (blastall) command generation def test_blastall_dbjobdict(path_fna_all, tmp_path): """Generate dictionary of legacy BLASTN database jobs.""" From 8f94ba05c4a16d2d16f841d62efa0d70bbfabf2b Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 14 Sep 2021 21:52:27 +0100 Subject: [PATCH 26/75] Make changes to reflect splitting of `anib` and `aniblastall` - remove `mode` or `method` arguments in function calls/definitions - change references to module names from `anib` to `aniblastall` where necessary --- pyani/anib.py | 6 +- pyani/aniblastall.py | 125 +++++++++++++++++-- pyani/scripts/average_nucleotide_identity.py | 15 ++- tests/test_anib.py | 8 +- tests/test_aniblastall.py | 34 ++--- tests/test_concordance.py | 16 +-- tests/test_multiprocessing.py | 2 +- 7 files changed, 157 insertions(+), 49 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 86f23996..4b5ec3e3 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -246,7 +246,7 @@ def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict: def make_blastcmd_builder( - method: str, + # method: str, outdir: Path, format_exe: Optional[Path] = None, blast_exe: Optional[Path] = None, @@ -405,7 +405,9 @@ def construct_blastn_cmdline( :param outdir: Path, to the output directory :param blastn_exe: str, path to blastn executable """ - prefix = outdir / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}" + prefix = Path(outdir) / Path( + f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}" + ) return ( f"{blastn_exe} -out {prefix}.blast_tab -query {query} -db {subj_db} " "-xdrop_gap_final 150 -dust no -evalue 1e-15 -max_target_seqs 1 -outfmt " diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index b77a59ea..24c9f901 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -41,6 +41,7 @@ import re import shutil import subprocess +from logging import Logger import pandas as pd @@ -51,10 +52,11 @@ from . import pyani_config from . import pyani_jobs -from .pyani_tools import BLASTcmds, BLASTfunctions, BLASTexes +from .pyani_tools import BLASTcmds, BLASTfunctions, BLASTexes, ANIResults +from pyani import pyani_files -def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str: +def get_version(blastall_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str: r"""Return BLAST blastall version as a string. :param blast_exe: path to blastall executable @@ -76,18 +78,46 @@ def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str: - no version info returned - executable cannot be run on this OS """ - cmdline = [blast_exe, "-version"] - result = subprocess.run( - cmdline, # type: ignore - shell=False, - stdout=subprocess.PIPE, # type: ignore - stderr=subprocess.PIPE, - check=False, # blastall doesn't return 0 - ) + logger = logging.getLogger(__name__) + + try: + blastall_path = Path(shutil.which(blastall_exe)) # type:ignore + except TypeError: + return f"{blastall_exe} is not found in $PATH" + + if not blastall_path.is_file(): # no executable + return f"No blastall at {blastall_path}" + + # This should catch cases when the file can't be executed by the user + if not os.access(blastall_path, os.X_OK): # file exists but not executable + return f"blastall exists at {blastall_path} but not executable" + + if platform.system() == "Darwin": + cmdline = [blastall_exe, "-version"] + else: + cmdline = [blastall_exe] + + try: + result = subprocess.run( + cmdline, # type: ignore + shell=False, + stdout=subprocess.PIPE, # type: ignore + stderr=subprocess.PIPE, + check=False, # blastall doesn't return 0 + ) + + except OSError: + logger.warning("blastall executable will not run", exc_info=True) + return f"blastall exists at {blastall_path} but could not be executed" + version = re.search( # type: ignore r"(?<=blastall\s)[0-9\.]*", str(result.stderr, "utf-8") ).group() - return f"{platform.system()}_{version}" + + if 0 == len(version.strip()): + return f"blastall exists at {blastall_path} but could not retrieve version" + + return f"{platform.system()}_{version} ({blastall_path})" # Divide input FASTA sequences into fragments @@ -185,7 +215,7 @@ def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict: def make_blastcmd_builder( - method: str, + # method: str, outdir: Path, format_exe: Optional[Path] = None, blast_exe: Optional[Path] = None, @@ -268,6 +298,7 @@ def generate_blastdb_commands( filenames: List[Path], outdir: Path, blastdb_exe: Optional[Path] = None, + # mode: "ANIblastall" ) -> List[Tuple[str, Path]]: """Return list of makeblastdb command-lines for ANIblastall. @@ -295,7 +326,7 @@ def construct_formatdb_cmd( :param outdir: Path, path to output directory :param blastdb_exe: Path, path to the formatdb executable """ - newfilename = outdir / filename.name + newfilename = Path(outdir) / Path(filename.name) shutil.copy(filename, newfilename) return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename) @@ -436,3 +467,71 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: sim_errors = filtered["blast_mismatch"].sum() + filtered["blast_gaps"].sum() filtered.to_csv(Path(filename).with_suffix(".blast_tab.dataframe"), sep="\t") return aln_length, sim_errors, ani_pid + + +def process_blast( + blast_dir: Path, + org_lengths: Dict, + fraglengths: Dict, + mode: str = "ANIblastall", + logger: Optional[Logger] = None, +) -> ANIResults: + """Return tuple of ANIb results for .blast_tab files in the output dir. + :param blast_dir: Path, path to the directory containing .blast_tab files + :param org_lengths: Dict, the base count for each input sequence + :param fraglengths: dictionary of query sequence fragment lengths, only + needed for BLASTALL output + :param mode: str, analysis type (ANIb or ANIblastall) + :param logger: a logger for messages + Returns the following pandas dataframes in an ANIResults object; + query sequences are rows, subject sequences are columns: + - alignment_lengths - non-symmetrical: total length of alignment + - percentage_identity - non-symmetrical: ANIb (Goris) percentage identity + - alignment_coverage - non-symmetrical: coverage of query + - similarity_errors - non-symmetrical: count of similarity errors + May throw a ZeroDivisionError if one or more BLAST runs failed, or a + very distant sequence was included in the analysis. + """ + # Process directory to identify input files + blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab") + # Hold data in ANIResults object + results = ANIResults(list(org_lengths.keys()), mode) + + # Fill diagonal NA values for alignment_length with org_lengths + for org, length in list(org_lengths.items()): + results.alignment_lengths[org][org] = length + + # Process .blast_tab files assuming that the filename format holds: + # org1_vs_org2.blast_tab: + for blastfile in blastfiles: + qname, sname = blastfile.stem.split("_vs_") + + # We may have BLAST files from other analyses in the same directory + # If this occurs, we raise a warning, and skip the file + if qname not in list(org_lengths.keys()): + if logger: + logger.warning( + "Query name %s not in input sequence list, skipping %s", + qname, + blastfile, + ) + continue + if sname not in list(org_lengths.keys()): + if logger: + logger.warning( + "Subject name %s not in input sequence list, skipping %s", + sname, + blastfile, + ) + continue + resultvals = parse_blast_tab(blastfile, fraglengths) + query_cover = float(resultvals[0]) / org_lengths[qname] + + # Populate dataframes: when assigning data, we need to note that + # we have asymmetrical data from BLAST output, so only the + # upper triangle is populated + results.add_tot_length(qname, sname, resultvals[0], sym=False) + results.add_sim_errors(qname, sname, resultvals[1], sym=False) + results.add_pid(qname, sname, 0.01 * resultvals[2], sym=False) + results.add_coverage(qname, sname, query_cover) + return results diff --git a/pyani/scripts/average_nucleotide_identity.py b/pyani/scripts/average_nucleotide_identity.py index 10fc7767..44f6f26f 100755 --- a/pyani/scripts/average_nucleotide_identity.py +++ b/pyani/scripts/average_nucleotide_identity.py @@ -141,7 +141,7 @@ import pandas as pd from pyani import ( - anib, + # anib, anim, tetra, pyani_config, @@ -627,6 +627,7 @@ def make_sequence_fragments( for ANIb methods), and writes BLAST databases of these fragments, and fragment lengths of sequences, to local files. """ + global anib fragfiles, fraglengths = anib.fragment_fasta_files(infiles, blastdir, args.fragsize) # Export fragment lengths as JSON, in case we re-run with --skip_blastn fragpath = blastdir / "fraglengths.json" @@ -659,7 +660,7 @@ def run_blast( # Run BLAST database-building and executables from a jobgraph logger.info("Creating job dependency graph") jobgraph = anib.make_job_graph( - infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir) + infiles, fragfiles, anib.make_blastcmd_builder(blastdir) ) if args.scheduler == "multiprocessing": logger.info("Running dependency graph with multiprocessing") @@ -740,9 +741,7 @@ def unified_anib( # Process pairwise BLASTN output logger.info("Processing pairwise %s BLAST output.", args.method) try: - data = anib.process_blast( - blastdir, org_lengths, fraglengths=fraglengths, mode=args.method - ) + data = anib.process_blast(blastdir, org_lengths, fraglengths=fraglengths) except ZeroDivisionError: logger.error("One or more BLAST output files has a problem.") if not args.skip_blastn: @@ -958,6 +957,12 @@ def run_main(argsin: Optional[Namespace] = None) -> int: logger = logging.getLogger(__name__) config_logger(args) + # Conditionally load correct blast module + if args.method == "ANIb": + globals()["anib"] = __import__("pyani.anib", fromlist="pyani") + elif args.method == "ANIblastall": + globals()["anib"] = __import__("pyani.aniblastall", fromlist="pyani") + # Ensure argument validity and get method function/config test_class_label_paths(args, logger) test_scheduler(args, logger) diff --git a/tests/test_anib.py b/tests/test_anib.py index 874c3bf4..1710e84f 100644 --- a/tests/test_anib.py +++ b/tests/test_anib.py @@ -124,7 +124,7 @@ def anib_output_dir(dir_anib_in): # Test BLAST+ (blastn) command generation def test_blastn_dbjobdict(path_fna_all, tmp_path): """Generate dictionary of BLASTN+ database jobs.""" - blastcmds = anib.make_blastcmd_builder("ANIb", tmp_path) + blastcmds = anib.make_blastcmd_builder(tmp_path) jobdict = anib.build_db_jobs(path_fna_all, blastcmds) expected = [ ( @@ -139,7 +139,7 @@ def test_blastn_dbjobdict(path_fna_all, tmp_path): def test_blastn_graph(path_fna_all, tmp_path, fragment_length): """Create jobgraph for BLASTN+ jobs.""" fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length) - blastcmds = anib.make_blastcmd_builder("ANIb", tmp_path) + blastcmds = anib.make_blastcmd_builder(tmp_path) jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds) # We check that the main script job is a blastn job, and that there # is a single dependency, which is a makeblastdb job @@ -153,7 +153,7 @@ def test_blastn_graph(path_fna_all, tmp_path, fragment_length): def test_blastn_multiple(path_fna_two, tmp_path): """Generate BLASTN+ commands.""" # BLAST+ - cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIb") + cmds = anib.generate_blastn_commands(path_fna_two, tmp_path) expected = [ ( f"blastn -out {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} " @@ -207,7 +207,7 @@ def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_leng @pytest.mark.skip(reason="unsure this is needed") def test_makeblastdb_multiple(path_fna_two, tmp_path): """Generate multiple BLAST+ makeblastdb command-lines.""" - cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIb") + cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path) expected = [ ( ( diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py index 70ae8108..5c5c6da6 100644 --- a/tests/test_aniblastall.py +++ b/tests/test_aniblastall.py @@ -161,8 +161,8 @@ def test_get_version_os_incompatible(executable_incompatible_with_os): # Test legacy BLAST (blastall) command generation def test_blastall_dbjobdict(path_fna_all, tmp_path): """Generate dictionary of legacy BLASTN database jobs.""" - blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path) - jobdict = anib.build_db_jobs(path_fna_all, blastcmds) + blastcmds = aniblastall.make_blastcmd_builder(tmp_path) + jobdict = aniblastall.build_db_jobs(path_fna_all, blastcmds) expected = [ (tmp_path / _.name, f"formatdb -p F -i {tmp_path / _.name} -t {_.stem}") for _ in path_fna_all @@ -172,9 +172,11 @@ def test_blastall_dbjobdict(path_fna_all, tmp_path): def test_blastall_graph(path_fna_all, tmp_path, fragment_length): """Create jobgraph for legacy BLASTN jobs.""" - fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length) - blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path) - jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds) + fragresult = aniblastall.fragment_fasta_files( + path_fna_all, tmp_path, fragment_length + ) + blastcmds = aniblastall.make_blastcmd_builder(tmp_path) + jobgraph = aniblastall.make_job_graph(path_fna_all, fragresult[0], blastcmds) # We check that the main script job is a blastn job, and that there # is a single dependency, which is a makeblastdb job for job in jobgraph: @@ -185,7 +187,7 @@ def test_blastall_graph(path_fna_all, tmp_path, fragment_length): def test_blastall_multiple(path_fna_two, tmp_path): """Generate legacy BLASTN commands.""" - cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIblastall") + cmds = aniblastall.generate_blastn_commands(path_fna_two, tmp_path) expected = [ ( "blastall -p blastn -o " @@ -207,7 +209,9 @@ def test_blastall_multiple(path_fna_two, tmp_path): def test_blastall_single(path_fna_two, tmp_path): """Generate legacy BLASTN command-line.""" - cmd = anib.construct_blastall_cmdline(path_fna_two[0], path_fna_two[1], tmp_path) + cmd = aniblastall.construct_blastall_cmdline( + path_fna_two[0], path_fna_two[1], tmp_path + ) expected = ( f"blastall -p blastn -o {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} " f"-i {path_fna_two[0]} " @@ -220,7 +224,7 @@ def test_blastall_single(path_fna_two, tmp_path): # Test legacy BLAST database formatting (formatdb) command generation def test_formatdb_multiple(path_fna_two, tmp_path): """Generate legacy BLAST db creation commands.""" - cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIblastall") + cmds = aniblastall.generate_blastdb_commands(path_fna_two, tmp_path) expected = [ ( f"formatdb -p F -i {tmp_path / path_fna_two[0].name} -t {path_fna_two[0].stem}", @@ -236,7 +240,7 @@ def test_formatdb_multiple(path_fna_two, tmp_path): def test_formatdb_single(path_fna, tmp_path): """Generate legacy BLAST formatdb command-line.""" - cmd = anib.construct_formatdb_cmd(path_fna, tmp_path) + cmd = aniblastall.construct_formatdb_cmd(path_fna, tmp_path) expected = f"formatdb -p F -i {tmp_path / path_fna.name} -t {path_fna.stem}" assert cmd[0] == expected @@ -245,9 +249,9 @@ def test_formatdb_single(path_fna, tmp_path): def test_parse_legacy_blastdir(anib_output_dir): """Parses directory of legacy BLAST output.""" orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles) - fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles) - result = anib.process_blast( - anib_output_dir.legacyblastdir, orglengths, fraglengths, mode="ANIblastall" + fraglengths = aniblastall.get_fraglength_dict(anib_output_dir.fragfiles) + result = aniblastall.process_blast( + anib_output_dir.legacyblastdir, orglengths, fraglengths ) assert_frame_equal( result.percentage_identity.sort_index(1).sort_index(), @@ -257,10 +261,8 @@ def test_parse_legacy_blastdir(anib_output_dir): def test_parse_legacy_blasttab(anib_output): """Parses ANIB legacy .blast_tab output.""" - fragdata = anib.get_fraglength_dict([anib_output.fragfile]) - result = anib.parse_blast_tab( - anib_output.legacytabfile, fragdata, mode="ANIblastall" - ) + fragdata = aniblastall.get_fraglength_dict([anib_output.fragfile]) + result = aniblastall.parse_blast_tab(anib_output.legacytabfile, fragdata) assert ( a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018]) ) diff --git a/tests/test_concordance.py b/tests/test_concordance.py index dcd0d583..3a9fa1b5 100644 --- a/tests/test_concordance.py +++ b/tests/test_concordance.py @@ -55,7 +55,7 @@ import pytest from pyani import run_multiprocessing as run_mp -from pyani import anib, anim, tetra, pyani_files +from pyani import anib, aniblastall, anim, tetra, pyani_files def parse_jspecies(infile): @@ -227,13 +227,13 @@ def test_anib_concordance( paths_concordance_fna, tmp_path, fragment_length ) jobgraph = anib.make_job_graph( - paths_concordance_fna, fragfiles, anib.make_blastcmd_builder("ANIb", tmp_path) + paths_concordance_fna, fragfiles, anib.make_blastcmd_builder(tmp_path) ) assert 0 == run_mp.run_dependency_graph(jobgraph) # Jobs must run correctly # Process BLAST output result_pid = anib.process_blast( - tmp_path, orglengths, fraglengths, mode="ANIb" + tmp_path, orglengths, fraglengths ).percentage_identity # Compare JSpecies output to results. We do this in two blocks, @@ -266,19 +266,19 @@ def test_aniblastall_concordance( orglengths = pyani_files.get_sequence_lengths(paths_concordance_fna) # Perform ANIblastall on the input directory contents - fragfiles, fraglengths = anib.fragment_fasta_files( + fragfiles, fraglengths = aniblastall.fragment_fasta_files( paths_concordance_fna, tmp_path, fragment_length ) - jobgraph = anib.make_job_graph( + jobgraph = aniblastall.make_job_graph( paths_concordance_fna, fragfiles, - anib.make_blastcmd_builder("ANIblastall", tmp_path), + aniblastall.make_blastcmd_builder(tmp_path), ) assert 0 == run_mp.run_dependency_graph(jobgraph) # Jobs must run correctly # Process BLAST output - result_pid = anib.process_blast( - tmp_path, orglengths, fraglengths, mode="ANIblastall" + result_pid = aniblastall.process_blast( + tmp_path, orglengths, fraglengths ).percentage_identity # Compare JSpecies output to results diff --git a/tests/test_multiprocessing.py b/tests/test_multiprocessing.py index 3757cd3a..f7e1dab1 100644 --- a/tests/test_multiprocessing.py +++ b/tests/test_multiprocessing.py @@ -96,7 +96,7 @@ def test_cmdsets(mp_dummy_cmds): def test_dependency_graph_run(path_fna_two, fragment_length, tmp_path): """Test that module runs dependency graph.""" fragresult = fragment_fasta_files(path_fna_two, tmp_path, fragment_length) - blastcmds = make_blastcmd_builder("ANIb", tmp_path) + blastcmds = make_blastcmd_builder(tmp_path) jobgraph = make_job_graph(path_fna_two, fragresult[0], blastcmds) result = run_dependency_graph(jobgraph) assert 0 == result From 6de8b0fe96c4fb8bae3b802237b762a17c510f98 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 21 Sep 2021 12:54:12 +0100 Subject: [PATCH 27/75] Add tests for `get_version()` These got removed somewhere? --- tests/test_anib.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_anib.py b/tests/test_anib.py index 1710e84f..e929b658 100644 --- a/tests/test_anib.py +++ b/tests/test_anib.py @@ -121,6 +121,34 @@ def anib_output_dir(dir_anib_in): ) +# Test get_version() +# Test case 1: there is no executable +def test_get_version_no_exe(executable_missing, monkeypatch): + """Test behaviour when there is no file at the specified executable location.""" + test_file_1 = Path("/non/existent/blastn") + assert anib.get_version(test_file_1) == f"No blastn executable at {test_file_1}" + + +# Test case 2: there is a file, but it is not executable +def test_get_version_exe_not_executable(executable_not_executable, monkeypatch): + """Test behaviour when the file at the executable location is not executable.""" + test_file_2 = Path("/non/executable/blastn") + assert ( + anib.get_version(test_file_2) + == f"blastn exists at {test_file_2} but not executable" + ) + + +# Test case 3: there is an executable file, but the version can't be retrieved +def test_get_version_exe_no_version(executable_without_version, monkeypatch): + """Test behaviour when the version for the executable can not be retrieved.""" + test_file_3 = Path("/missing/version/blastn") + assert ( + anib.get_version(test_file_3) + == f"blastn exists at {test_file_3} but could not retrieve version" + ) + + # Test BLAST+ (blastn) command generation def test_blastn_dbjobdict(path_fna_all, tmp_path): """Generate dictionary of BLASTN+ database jobs.""" From 7da06d938d0a1848a40b607f027b0db2edddb5f4 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 21:53:15 +0100 Subject: [PATCH 28/75] Fixed typos and put one variable name in camelcase --- pyani/scripts/subcommands/subcmd_anib.py | 53 ++++++++++++------------ 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index abf1cda6..f31bb895 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -79,7 +79,7 @@ # implementation class ComparisonJob(NamedTuple): - """Pairwise comparison job for the SQLAlchemy implementation""" + """Pairwise comparison job for the SQLAlchemy implementation.""" query: str subject: str @@ -219,7 +219,7 @@ def subcmd_anib(args: Namespace) -> None: "Compiling pairwise comparisons (this can take time for large datasets)..." ) comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2)) - logger.info(f"\t...total parwise comparisons to be performed: {len(comparisons)}") + logger.info(f"\t...total pairwise comparisons to be performed: {len(comparisons)}") # Check for existing comparisons; if one has already been done (for the same # software package, version, and setting) we add the comparison to this run, @@ -255,17 +255,17 @@ def subcmd_anib(args: Namespace) -> None: logger.debug( "\tIn this mode, existing comparison output from %s is reused", args.outdir ) - existingfiles = collect_existing_output(args.outdir, "blastn", args) - if existingfiles: + existing_files = collect_existing_output(args.outdir, "blastn", args) + if existing_files: logger.debug( "\tIdentified %s existing output files for reuse, %s (etc)", - len(existingfiles), - existingfiles[0], + len(existing_files), + existing_files[0], ) else: logger.debug("\tIdentified no existing output files") else: - existingfiles = list() + existing_files = list() logger.debug("\tAssuming no pre-existing output files") ## Split the input genome files into contiguous fragments of the specified size, @@ -286,7 +286,7 @@ def subcmd_anib(args: Namespace) -> None: # This method considered, but doesn't get fraglens # fragfiles = [file for file in fragdir.iterdir()] joblist = generate_joblist( - comparisons_to_run, existingfiles, fragfiles.values(), fraglens.values(), args + comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args ) logger.debug("...created %s blastn jobs", len(joblist)) # print(f"Joblist: {joblist}") @@ -308,7 +308,7 @@ def subcmd_anib(args: Namespace) -> None: def generate_joblist( comparisons: List, - existingfiles: List, + existing_files: List, fragfiles: List, fragsizes: List, args: Namespace, @@ -316,14 +316,14 @@ def generate_joblist( """Return list of ComparisonJobs. :param comparisons: list of (Genome, Genome) tuples for which comparisons are needed - :param existingfiles: list of pre-existing BLASTN+ outputs + :param existing_files: list of pre-existing BLASTN+ outputs :param fragfiles: list of files containing genome fragments :param fragsizes: list of fragment lengths :param args: Namespace, command-line arguments """ logger = logging.getLogger(__name__) - existingfiles = set(existingfiles) # Path objects hashable + existing_files = set(existing_files) # Path objects hashable joblist = [] # will hold ComparisonJob structs for idx, (query, subject) in enumerate( @@ -339,16 +339,17 @@ def generate_joblist( outfname = Path(outprefix + ".blast_tab") logger.debug("Expected output file for db: %s", outfname) - # If e're in recovery mode, we don't want to repeat a computational - # comparison that already exists, so we check whether the ultimate - # output is in the set of existing files and, if not, we add the obs + # If we are in recovery mode, we are salvaging output from a previous + # run, and do not necessarily need to rerun all the jobs. In this case, + # we prepare a list of output files we want to recover from the results + # in the output directory. # The comparisons collection always gets updated, so that results are # added to the database whether they come from recovery mode or are run # in this call of the script. - if args.recovery and outfname in existingfiles: + if args.recovery and outfname in existing_files: logger.debug("Recovering output from %s, not submitting job", outfname) - # Need to track the expected output, but set the job itself to None: + # Need to track the expected output, but set the job itself to None. joblist.append( ComparisonJob(query, subject, blastcmd, outfname, args.fragsize, None) ) @@ -449,18 +450,18 @@ def update_comparison_results( args: Namespace, ) -> None: """Update the Comparision table with the completed result set. - # - :param joblist: list of ComparisonJob namedtuples - :param run: Run ORM object for the current ANIb run - :param session: active pyanidb session via ORM - :param blastn_version: version of blastn used for the comparison - :param fraglens: dictionary of fragment lengths for each genome - :param args: command-line arguments for this run - # - The Comparison table stores individual comparison results, one per row. + + :param joblist: list of ComparisonJob namedtuples + :param run: Run ORM object for the current ANIb run + :param session: active pyanidb session via ORM + :param blastn_version: version of blastn used for the comparison + :param fraglens: dictionary of fragment lengths for each genome + :param args: command-line arguments for this run + + The Comparison table stores individual comparison results, one per row. """ logger = logging.getLogger(__name__) - # + # Add individual results to Comparison table for job in tqdm(joblist, disable=args.disable_tqdm): logger.debug("\t%s vs %s", job.query.description, job.subject.description) From fba3662c7384878c58e565e9212f96ffad022113 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 21:54:22 +0100 Subject: [PATCH 29/75] Fix which file is passed as input and which as blastdb --- pyani/scripts/subcommands/subcmd_anib.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index f31bb895..9a8418f8 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -201,7 +201,7 @@ def subcmd_anib(args: Namespace) -> None: fraglens.update({genome: fragsizes}) dbcmd, blastdbpath = anib.construct_makeblastdb_cmd( - fragpath, blastdbdir + Path(genome.path), blastdbdir # fragpath, blastdbdir ) # args.outdir) subprocess.run( @@ -329,10 +329,14 @@ def generate_joblist( for idx, (query, subject) in enumerate( tqdm(comparisons, disable=args.disable_tqdm) ): - # fname1 = Path(query.path) - # fname2 = Path(subject.path) + qprefix, qsuffix = ( + args.outdir / "fragments" / Path(query.path).stem, + Path(query.path).suffix, + ) + qfrags = Path(f"{qprefix}-fragments{qsuffix}") + blastcmd = anib.generate_blastn_commands( - Path(query.path), Path(subject.path), args.outdir, args.blastn_exe + qfrags, Path(subject.path), args.outdir, args.blastn_exe ) logger.debug("Commands to run:\n\t%s\n", blastcmd) outprefix = blastcmd.split()[2][:-10] # prefix for blastn output @@ -472,8 +476,8 @@ def update_comparison_results( Comparison( query=job.query, subject=job.subject, - aln_length=aln_length, - sim_errs=sim_errs, + aln_length=int(aln_length), + sim_errs=int(sim_errs), identity=ani_pid, cov_query=qcov, cov_subject=scov, From 1fb3c0830b953b98ba2c4f1010847253cd93cbcd Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 21:55:23 +0100 Subject: [PATCH 30/75] Move serialisation of fragment dictionary to primary function --- pyani/scripts/subcommands/subcmd_anib.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 9a8418f8..3f76cae8 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -212,7 +212,9 @@ def subcmd_anib(args: Namespace) -> None: check=False, ) - add_blastdb(session, genome, run, fragpath, blastdbpath, fragsizes, dbcmd) + add_blastdb( + session, genome, run, fragpath, blastdbpath, json.dumps(fragsizes), dbcmd + ) # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples logger.info( @@ -400,7 +402,7 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path # Write fragments to output file fragpath = outdir / f"{inpath.stem}-fragments.fna" SeqIO.write(outseqs, fragpath, "fasta") - return fragpath, json.dumps(sizedict) + return fragpath, sizedict def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: From 6c12a72c299cd2aa0ff9d012b8bef8392439ccae Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 21:56:07 +0100 Subject: [PATCH 31/75] Added useful debugging lines to `anib.py` May be removed later on --- pyani/scripts/subcommands/subcmd_anib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 3f76cae8..525e67e1 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -472,6 +472,8 @@ def update_comparison_results( for job in tqdm(joblist, disable=args.disable_tqdm): logger.debug("\t%s vs %s", job.query.description, job.subject.description) aln_length, sim_errs, ani_pid = anib.parse_blast_tab(job.outfile, fraglens) + logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}") + logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}") qcov = aln_length / job.query.length scov = aln_length / job.subject.length run.comparisons.append( From 419db64265d5c98c0e897f0fc98cff04e1ad019a Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 21:56:56 +0100 Subject: [PATCH 32/75] Add long versions of CLI flags to `anib_parser.py` --- pyani/scripts/parsers/anib_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyani/scripts/parsers/anib_parser.py b/pyani/scripts/parsers/anib_parser.py index 7473afc8..b2b99785 100644 --- a/pyani/scripts/parsers/anib_parser.py +++ b/pyani/scripts/parsers/anib_parser.py @@ -65,6 +65,7 @@ def build( ) parser.add_argument( "-i", + "--indir", action="store", dest="indir", default=None, @@ -73,6 +74,7 @@ def build( ) parser.add_argument( "-o", + "--outdir", action="store", dest="outdir", default=None, From fd41e4cbb0c5b79c05d3dbc5e9ce38cad59fe1df Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:00:32 +0100 Subject: [PATCH 33/75] Comment out (likely) unnecessary function Will be removed later --- pyani/anib.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 4b5ec3e3..65b40c60 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -324,26 +324,26 @@ def make_job_graph( return joblist -# Generate list of makeblastdb command lines from passed filenames -def generate_blastdb_commands( - filenames: List[Path], - outdir: Path, - blastdb_exe: Optional[Path] = None, -) -> List[Tuple[str, Path]]: - """Return list of makeblastdb command-lines for ANIb. - - :param filenames: a list of paths to input FASTA files - :param outdir: path to output directory - :param blastdb_exe: path to the makeblastdb executable - :param method: str, ANIb analysis type (ANIb) - """ - if blastdb_exe is None: - cmdlines = [construct_makeblastdb_cmd(fname, outdir) for fname in filenames] - else: - cmdlines = [ - construct_makeblastdb_cmd(fname, outdir, blastdb_exe) for fname in filenames - ] - return cmdlines +## Generate list of makeblastdb command lines from passed filenames +# def generate_blastdb_commands( +# filenames: List[Path], +# outdir: Path, +# blastdb_exe: Optional[Path] = None, +# ) -> List[Tuple[str, Path]]: +# """Return list of makeblastdb command-lines for ANIb. +# +# :param filenames: a list of paths to input FASTA files +# :param outdir: path to output directory +# :param blastdb_exe: path to the makeblastdb executable +# :param method: str, ANIb analysis type (ANIb) +# """ +# if blastdb_exe is None: +# cmdlines = [construct_makeblastdb_cmd(fname, outdir) for fname in filenames] +# else: +# cmdlines = [ +# construct_makeblastdb_cmd(fname, outdir, blastdb_exe) for fname in filenames +# ] +# return cmdlines # Generate single makeblastdb command line From 104178aa748f896e21815a6e9319673c5b7b4e14 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:00:58 +0100 Subject: [PATCH 34/75] Update docstrings to reflect new comparison handling --- pyani/anib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 65b40c60..5683fd20 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -254,7 +254,6 @@ def make_blastcmd_builder( ) -> BLASTcmds: """Return BLASTcmds object for construction of BLAST commands. - :param method: str, the kind of ANI analysis (ANIb) :param outdir: :param format_exe: :param blast_exe: @@ -400,8 +399,8 @@ def construct_blastn_cmdline( ) -> str: """Return a single blastn command. - :param fname1: Path, FASTA file for query genome - :param fname2: Path, database of fragments for subject genome + :param query: Path, FASTA file for query genome + :param subj_db: Path, database of fragments for subject genome :param outdir: Path, to the output directory :param blastn_exe: str, path to blastn executable """ From 37477db937903cee676b451bf033b20c89f3f553 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:01:16 +0100 Subject: [PATCH 35/75] Add useful debugging lines May be removed later --- pyani/anib.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyani/anib.py b/pyani/anib.py index 5683fd20..43101ebc 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -83,6 +83,7 @@ import os import platform +import logging import re import shutil import subprocess @@ -509,6 +510,8 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: over an alignable region of at least 70% of their length. ''' """ + logger = logging.getLogger(__name__) + # Load output as dataframe columns = [ "sbjct_id", @@ -535,6 +538,8 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: data.columns = columns except pd.io.common.EmptyDataError: data = pd.DataFrame(columns=columns) + # logger.debug(f"Index: {[idx for idx in data.index]}") + logger.debug(f"data: {data.head()}") # Add new columns for recalculated alignment length, proportion, and # percentage identity data["ani_alnlen"] = data["blast_alnlen"] - data["blast_gaps"] From 8cf205b0881a9ceca7d1de0e2439b61a8be31dd3 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:01:34 +0100 Subject: [PATCH 36/75] Populate `aniblastall_parser.py` --- pyani/scripts/parsers/aniblastall_parser.py | 62 +++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/pyani/scripts/parsers/aniblastall_parser.py b/pyani/scripts/parsers/aniblastall_parser.py index 10283210..364769ec 100644 --- a/pyani/scripts/parsers/aniblastall_parser.py +++ b/pyani/scripts/parsers/aniblastall_parser.py @@ -39,10 +39,14 @@ """Provides parser for aniblastall subcommand.""" from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, _SubParsersAction +from pathlib import Path from typing import List, Optional +from pyani import pyani_config from pyani.scripts import subcommands +from pyani import __version__ + def build( subps: _SubParsersAction, parents: Optional[List[ArgumentParser]] = None @@ -51,8 +55,66 @@ def build( :param subps: collection of subparsers in main parser :param parents: parsers from which arguments are inherited + + The terminology may be confusing, but in practice the main parser collects + command-line arguments that are then available to this parser, which inherits + options from the parsers in `parents` in addition to those defined below. """ parser = subps.add_parser( "aniblastall", parents=parents, formatter_class=ArgumentDefaultsHelpFormatter ) + # Required arguments + parser.add_argument( + "-i", + "--indir", + action="store", + dest="indir", + default=None, + required=True, + type=Path, + help="input genome directory (required)", + ) + parser.add_argument( + "-o", + "--outdir", + action="store", + dest="outdir", + default=None, + required=True, + type=Path, + help="output analysis results directory (required)", + ) + # Optional arguments + parser.add_argument( + "--dbpath", + action="store", + dest="dbpath", + default=Path(".pyani/pyanidb"), + type=Path, + help="path to pyani database", + ) + parser.add_argument( + "--blastall_exe", + action="store", + dest="blastall_exe", + default=pyani_config.BLASTALL_DEFAULT, + type=Path, + help="path to blastall executable", + ) + parser.add_argument( + "--format_exe", + dest="format_exe", + action="store", + default=pyani_config.FORMATDB_DEFAULT, + type=Path, + help="path to formatdb executable", + ) + parser.add_argument( + "--fragsize", + dest="fragsize", + action="store", + type=int, + default=pyani_config.FRAGSIZE, + help="blastn query fragment size", + ) parser.set_defaults(func=subcommands.subcmd_aniblastall) From afd1be15a48254d518fca7657f1043f495c59de5 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:15:34 +0100 Subject: [PATCH 37/75] Comment out (likely) unnecessary function Will be removed later --- pyani/aniblastall.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index 24c9f901..ed2b9cdb 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -293,27 +293,26 @@ def make_job_graph( return joblist -# Generate list of makeblastdb command lines from passed filenames -def generate_blastdb_commands( - filenames: List[Path], - outdir: Path, - blastdb_exe: Optional[Path] = None, - # mode: "ANIblastall" -) -> List[Tuple[str, Path]]: - """Return list of makeblastdb command-lines for ANIblastall. - - :param filenames: a list of paths to input FASTA files - :param outdir: path to output directory - :param blastdb_exe: path to the makeblastdb executable - :param method: str, ANI analysis type (ANIblastall) - """ - if blastdb_exe is None: - cmdlines = [construct_formatdb_cmd(fname, outdir) for fname in filenames] - else: - cmdlines = [ - construct_formatdb_cmd(fname, outdir, blastdb_exe) for fname in filenames - ] - return cmdlines +## Generate list of makeblastdb command lines from passed filenames +# def generate_blastdb_commands( +# filenames: List[Path], +# outdir: Path, +# blastdb_exe: Optional[Path] = None, +# ) -> List[Tuple[str, Path]]: +# """Return list of makeblastdb command-lines for ANIblastall. +# +# :param filenames: a list of paths to input FASTA files +# :param outdir: path to output directory +# :param blastdb_exe: path to the makeblastdb executable +# :param method: str, ANI analysis type (ANIblastall) +# """ +# if blastdb_exe is None: +# cmdlines = [construct_formatdb_cmd(fname, outdir) for fname in filenames] +# else: +# cmdlines = [ +# construct_formatdb_cmd(fname, outdir, blastdb_exe) for fname in filenames +# ] +# return cmdlines # Generate single makeblastdb command line From 7dc1113deaaca649ef27ace1d40b2ceaf5f71f03 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:22:53 +0100 Subject: [PATCH 38/75] Simplify generation of command lines and update supporting docstrings --- pyani/aniblastall.py | 47 +++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index ed2b9cdb..cd5cdd06 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -331,14 +331,17 @@ def construct_formatdb_cmd( # Generate list of BLASTN command lines from passed filenames -def generate_blastn_commands( - filenames: List[Path], +def generate_blastall_commands( + # filenames: List[Path], + query: Path, + subject: Path, outdir: Path, - blast_exe: Optional[Path] = None, + blastall_exe: Optional[Path] = None, ) -> List[str]: """Return a list of blastn command-lines for ANIblastall. - :param filenames: a list of paths to fragmented input FASTA files + :param query: a paths to the query's fragmented input FASTA file + :param subject: a paths to the subject's fragmented input FASTA file :param outdir: path to output directory :param blastn_exe: path to BLASTN executable :param method: str, analysis type (ANIblastall) @@ -348,41 +351,31 @@ def generate_blastn_commands( have the form ACCESSION.ext. This is the convention followed by the fragment_FASTA_files() function above. """ - cmdlines = [] - for idx, fname1 in enumerate(filenames[:-1]): - dbname1 = Path(str(fname1).replace("-fragments", "")) - for fname2 in filenames[idx + 1 :]: - dbname2 = Path(str(fname2).replace("-fragments", "")) - if blast_exe is None: - cmdlines.append(construct_blastall_cmdline(fname1, dbname2, outdir)) - cmdlines.append(construct_blastall_cmdline(fname2, dbname1, outdir)) - else: - cmdlines.append( - construct_blastall_cmdline(fname1, dbname2, outdir, blast_exe) - ) - cmdlines.append( - construct_blastall_cmdline(fname2, dbname1, outdir, blast_exe) - ) - return cmdlines + subj_db = outdir / "blastalldbs" / Path(str(subject.name)) + if blastall_exe is None: + cmdline = construct_blastall_cmdline(query, subj_db, outdir) + else: + cmdline = construct_blastall_cmdline(query, subj_db, outdir, blastall_exe) + return cmdline # Generate single BLASTALL command line def construct_blastall_cmdline( - fname1: Path, - fname2: Path, + query: Path, + subj_db: Path, outdir: Path, blastall_exe: Path = pyani_config.BLASTALL_DEFAULT, ) -> str: """Return single blastall command. - :param fname1: - :param fname2: - :param outdir: :param blastall_exe: str, path to BLASTALL executable + :param query: Path, FASTA file for query genome + :param subj_db: Path, database of fragments for subject genome + :param outdir: Path, to the output directory """ - prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}" + prefix = Path(outdir) / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}" return ( - f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {fname1} -d {fname2} " + f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {query} -d {subj_db} " "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8" ) From ee90a68a23e40346e612067389c5c0753a43aeef Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:23:29 +0100 Subject: [PATCH 39/75] Update command name referenced in docstrings --- pyani/aniblastall.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index cd5cdd06..33eac9d7 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -223,7 +223,6 @@ def make_blastcmd_builder( ) -> BLASTcmds: """Return BLASTcmds object for construction of BLAST commands. - :param method: str, the kind of ANI analysis (ANIblastall) :param outdir: :param format_exe: :param blast_exe: @@ -343,7 +342,7 @@ def generate_blastall_commands( :param query: a paths to the query's fragmented input FASTA file :param subject: a paths to the subject's fragmented input FASTA file :param outdir: path to output directory - :param blastn_exe: path to BLASTN executable + :param blastall_exe: path to BLASTALL executable :param method: str, analysis type (ANIblastall) Assumes that the fragment sequence input filenames have the form @@ -368,10 +367,10 @@ def construct_blastall_cmdline( ) -> str: """Return single blastall command. - :param blastall_exe: str, path to BLASTALL executable :param query: Path, FASTA file for query genome :param subj_db: Path, database of fragments for subject genome :param outdir: Path, to the output directory + :param blastall_exe: str, path to blastall executable """ prefix = Path(outdir) / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}" return ( From e8f6e417d1ac18a489fd1b744bafc3ff2775ea9e Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:24:11 +0100 Subject: [PATCH 40/75] Fix which files are passed as input and blastdb in command generation --- pyani/aniblastall.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index 33eac9d7..03f50299 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -324,7 +324,9 @@ def construct_formatdb_cmd( :param outdir: Path, path to output directory :param blastdb_exe: Path, path to the formatdb executable """ - newfilename = Path(outdir) / Path(filename.name) + newfilename = ( + Path(outdir) / Path(filename).name + ) # Path(filename.name.replace("-fragments", "")) shutil.copy(filename, newfilename) return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename) From c0c518aaccd6a3547380021ed689cc5413453013 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:24:46 +0100 Subject: [PATCH 41/75] Add useful debugging lines May be removed later --- pyani/aniblastall.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index 03f50299..6e3f4899 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -324,10 +324,12 @@ def construct_formatdb_cmd( :param outdir: Path, path to output directory :param blastdb_exe: Path, path to the formatdb executable """ + logger = logging.getLogger(__name__) newfilename = ( Path(outdir) / Path(filename).name ) # Path(filename.name.replace("-fragments", "")) shutil.copy(filename, newfilename) + logger.debug(f"mv {filename} {newfilename}") return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename) @@ -401,10 +403,17 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: over an alignable region of at least 70% of their length. ''' """ + logger = logging.getLogger(__name__) + # Assuming that the filename format holds org1_vs_org2.blast_tab: + logger.debug(f"Filename: {filename}") qname = filename.stem.split("_vs_")[0] + logger.debug(f"Qname: {qname}") # Load output as dataframe + logger.debug(f"Keys: {fraglengths.keys()}") qfraglengths = fraglengths[qname] + # logger.debug(f"Qkeys: {qfraglengths.keys()}") + logger.debug(f"Qfraglengths: {type(qfraglengths)}") columns = [ "sid", "blast_pid", @@ -422,12 +431,14 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: # regions of homology. This causes pandas to throw an error on CSV import. # To get past this, we create an empty dataframe with the appropriate # columns. + # logger.debug(pd.read_csv(filename, header=None, index_col=0)) try: data = pd.read_csv(filename, header=None, sep="\t", index_col=0) data.columns = columns except pd.io.common.EmptyDataError: data = pd.DataFrame(columns=columns) # Add new column for fragment length, only for BLASTALL + logger.debug(f"Data: {data.head()}") data["qlen"] = pd.Series( [qfraglengths[idx] for idx in data.index], index=data.index ) @@ -437,6 +448,8 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: data["ani_alnids"] = data["ani_alnlen"] - data["blast_mismatch"] data["ani_coverage"] = data["ani_alnlen"] / data["qlen"] data["ani_pid"] = data["ani_alnids"] / data["qlen"] + logger.debug(f"data: {data.head()}") + # Filter rows on 'ani_coverage' > 0.7, 'ani_pid' > 0.3 filtered = data[(data["ani_coverage"] > 0.7) & (data["ani_pid"] > 0.3)] # Dedupe query hits, so we only take the best hit @@ -457,8 +470,10 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]: if pd.isnull(ani_pid): # Happens if there are no matches in ANIb ani_pid = 0 aln_length = filtered["ani_alnlen"].sum() + logger.debug(f"Alignment length, aniblastall.py: {aln_length.sum()}") sim_errors = filtered["blast_mismatch"].sum() + filtered["blast_gaps"].sum() filtered.to_csv(Path(filename).with_suffix(".blast_tab.dataframe"), sep="\t") + logger.debug(f"Results, aniblastall.py: {aln_length}, {sim_errors}, {ani_pid}") return aln_length, sim_errors, ani_pid @@ -485,6 +500,7 @@ def process_blast( May throw a ZeroDivisionError if one or more BLAST runs failed, or a very distant sequence was included in the analysis. """ + logger = logging.getLogger(__name__) # Process directory to identify input files blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab") # Hold data in ANIResults object @@ -518,6 +534,7 @@ def process_blast( ) continue resultvals = parse_blast_tab(blastfile, fraglengths) + logger.debug(f"Resultvals: {resultvals}") query_cover = float(resultvals[0]) / org_lengths[qname] # Populate dataframes: when assigning data, we need to note that From dd1ee44b66ae65b821b89842994b414ff6b0b1a3 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:33:12 +0100 Subject: [PATCH 42/75] Add imports and convenience struct for `ComparisonJob` --- .../scripts/subcommands/subcmd_aniblastall.py | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index f91208cd..077860db 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -39,11 +39,58 @@ # THE SOFTWARE. """Provides the aniblastall subcommand for pyani.""" +import datetime +import json +import logging +import os +import subprocess + from argparse import Namespace from logging import Logger +from itertools import permutations +from pathlib import Path +from typing import List, NamedTuple, Tuple, Dict + +from Bio import SeqIO +from tqdm import tqdm + +from pyani import ( + PyaniException, + aniblastall, + pyani_config, + pyani_jobs, + run_sge, + run_multiprocessing as run_mp, +) +from pyani.pyani_files import collect_existing_output +from pyani.pyani_orm import ( + add_run, + add_run_genomes, + add_blastdb, + Comparison, + filter_existing_comparisons, + get_session, + PyaniORMException, + update_comparison_matrices, +) +from pyani.pyani_tools import termcolor + + +# Convenience struct describing a pairwise comparison job for the SQLAlchemy +# implementation +class ComparisonJob(NamedTuple): + + """Pairwise comparison job for the SQLAlchemy implementation.""" + + query: str + subject: str + blastcmd: str + outfile: Path + fragsize: int + job: pyani_jobs.Job -def subcmd_aniblastall(args: Namespace): +def subcmd_aniblastall(args: Namespace) -> None: """Perform ANIblastall on all genome files in an input directory. :param args: From 8cfbfe8b16d4385ca61c2374c7b80428db3fdc24 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:35:19 +0100 Subject: [PATCH 43/75] Add initial code for starting a run and adding it to the database --- .../scripts/subcommands/subcmd_aniblastall.py | 51 ++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 077860db..bcdffd53 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -93,7 +93,54 @@ class ComparisonJob(NamedTuple): def subcmd_aniblastall(args: Namespace) -> None: """Perform ANIblastall on all genome files in an input directory. - :param args: + :param args: Namespace, command-line arguments :param logger: + + Finds ANI by the ANIblastall method, as described in ... some paper. + + All FASTA format files (selected by suffix) in the input directory are fragmented into (by default 1020nt) consecutive sections, and a BLASTALL database constructed from the whole genome input. The BLAST+ blastall tool is then used to query each set of fragments against each BLAST+ database, in turn. + + For each query, the BLAST+ .tab output is parsed to obtain alignment length, identity and similarity error count. Alignments below a threshold are not included in the calculation (this introduces systematic bias with respect to ANIm). The results are processed to calculate the ANI percentages, coverage, and similarity error. + + The calculated values are stored in the local SQLite3 database. """ - raise NotImplementedError + # Create logger + logger = logging.getLogger(__name__) + + # Announce the analysis + logger.info(termcolor("Running ANIblastall analysis", "red")) + + # Get BLASTALL version - this will be used in the database entreis + blastall_version = aniblastall.get_version(args.blastall_exe) + logger.info(termcolor(f"BLAST+ blastall version: {blastall_version}", "cyan")) + + # Use provided name, or make new one for this analysis + start_time = datetime.datetime.now() + name = args.name or "_".join(["ANIblastall", start_time.isoformat()]) + logger.info(termcolor(f"Analysis name: {name}", "cyan")) + + # Connect to existing database (which may be "clean" or have old analyses) + logger.debug(f"Connecting to database {args.dbpath}") + try: + session = get_session(args.dbpath) + except Exception: + logger.error( + f"Could not connect to database {args.dbpath} (exiting)", exc_info=True + ) + raise SystemExit(1) + + # Add information about this run to the database + logger.debug(f"Adding run info to database {args.dbpath}...") + try: + run = add_run( + session, + method="ANIblastall", + cmdline=args.cmdline, + date=start_time, + status="started", + name=name, + ) + except PyaniORMException: + logger.error("Could not add run to the database (exiting)", exc_info=True) + raise SystemExit(1) + logger.debug(f"\t...added run ID: {run} to the database") From 2aa686b5140cb96debcd55fabd8b3319fb0c77c6 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:36:03 +0100 Subject: [PATCH 44/75] Add genomes for the ru to the database --- pyani/scripts/subcommands/subcmd_aniblastall.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index bcdffd53..ac186a51 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -144,3 +144,16 @@ def subcmd_aniblastall(args: Namespace) -> None: logger.error("Could not add run to the database (exiting)", exc_info=True) raise SystemExit(1) logger.debug(f"\t...added run ID: {run} to the database") + + # Identify input files for comparison, and populate the database + logger.debug(f"Adding files for {run} to database...") + try: + genome_ids = add_run_genomes( + session, run, args.indir, args.classes, args.labels + ) + except PyaniORMException: + logger.error( + f"Could not add genomes to database for run {run} (exiting)", exc_info=True + ) + raise SystemExit(1) + logger.debug(f"\t...added gnome IDs: {genome_ids}") From 10bd2a27296422bd6fd9626969fe0a82c580baf2 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:43:16 +0100 Subject: [PATCH 45/75] Get genomes for the run and create output directories --- .../scripts/subcommands/subcmd_aniblastall.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index ac186a51..53448458 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -157,3 +157,24 @@ def subcmd_aniblastall(args: Namespace) -> None: ) raise SystemExit(1) logger.debug(f"\t...added gnome IDs: {genome_ids}") + + # Get list of genomes for this anlaysis from the database + logger.info("Compiling genomes for comparison") + genomes = run.genomes.all() + logger.debug(f"\tCollected {len(genomes)} genomes for this run") + + # Create output directories. We create the amin parent directory (args.outdir), but + # also subdirectories for the BLAST databases. + logger.debug(f"Creating output directory {args.outdir}") + try: + os.makedirs(args.outdir, exist_ok=True) + except IOError: + logger.error( + f"Could not create output directory {args.outdir} (exiting)", exc_info=True + ) + raise SystemError(1) + fragdir = Path(str(args.outdir)) / "fragments" + blastdbdir = Path(str(args.outdir)) / "blastalldbs" + logger.debug("\t...creating subdirectories") + os.makedirs(fragdir, exist_ok=True) + os.makedirs(blastdbdir, exist_ok=True) From 8df94c7be8a965d5f4211faa3b574f108c2193ec Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:49:34 +0100 Subject: [PATCH 46/75] Add code to fragment fasta files --- .../scripts/subcommands/subcmd_aniblastall.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 53448458..57505291 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -178,3 +178,49 @@ def subcmd_aniblastall(args: Namespace) -> None: logger.debug("\t...creating subdirectories") os.makedirs(fragdir, exist_ok=True) os.makedirs(blastdbdir, exist_ok=True) + + # Create a new sequence fragment file and a new BLAST+ database for each input genome, + # and add this data to the database as a row in BlastDB + logger.info("Creating input sequence fragment files") + fragfiles = {} + fraglens = {} + for genome in genomes: + fragpath, fragsizes = fragment_fasta_file( + Path(str(genome.path)), Path(str(fragdir)), args.fragsize + ) + logger.info(f"fragsizes: {type(fragsizes)}") + fragfiles.update({Path(genome.path).stem: fragpath}) + fraglens.update({Path(genome.path).stem: fragsizes}) + + +def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: + """Return path to fragmented sequence file and JSON of fragment lengths. + + :param inpath: Path to genome file + :param outdir: Path to directory to hold fragmented files + :param fragsize: size of genome fragments + + Returns a tuple of ``(path, json)`` where ``path`` is the path to the fragment + file and ``json`` is a JSON-ified dictionary of fragment lengths, keyed by + fragment sequence ID. + """ + # Generate fragments for the input sequence, looping over each contig/ + # chromosome in the input file and breaking into sections of length + # fragsize + sizedict = {} + outseqs = [] + count = 0 + for seq in SeqIO.parse(inpath, "fasta"): + idx = 0 + while idx < len(seq): + count += 1 + newseq = seq[idx : idx + fragsize] + newseq.id = f"frag{count:05d}" + outseqs.append(newseq) + sizedict[newseq.id] = len(newseq) + idx += fragsize + + # Write fragments to output file + fragpath = outdir / f"{inpath.stem}-fragments.fna" + SeqIO.write(outseqs, fragpath, "fasta") + return fragpath, sizedict From e42c0890eb281ee05fbbc236f0c344fb2936f79e Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:50:09 +0100 Subject: [PATCH 47/75] Add code to create `blastdb` --- .../scripts/subcommands/subcmd_aniblastall.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 57505291..7d06bae9 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -192,6 +192,25 @@ def subcmd_aniblastall(args: Namespace) -> None: fragfiles.update({Path(genome.path).stem: fragpath}) fraglens.update({Path(genome.path).stem: fragsizes}) + logger.info("Constructing formatdb command") + dbcmd, blastdbpath = aniblastall.construct_formatdb_cmd( + Path(genome.path), blastdbdir + ) + + logger.info("Running subprocess") + subprocess.run( + dbcmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + + logger.info("Adding blastdb") + add_blastdb( + session, genome, run, fragpath, blastdbpath, json.dumps(fragsizes), dbcmd + ) + def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: """Return path to fragmented sequence file and JSON of fragment lengths. From 05665ee8b56263f7882190f72ead28ad7ee5e06e Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:51:13 +0100 Subject: [PATCH 48/75] Add code to generate list of comparisons and filter existing ones --- .../scripts/subcommands/subcmd_aniblastall.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 7d06bae9..e3251c5a 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -211,6 +211,24 @@ def subcmd_aniblastall(args: Namespace) -> None: session, genome, run, fragpath, blastdbpath, json.dumps(fragsizes), dbcmd ) + # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples + logger.info( + "Compiling pairwise comparisons (this can take time for large datasets)..." + ) + comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2)) + logger.info(f"\t...total pairwise comparisons to be performed: {len(comparisons)}") + + # Check for existing comparisons; if one has already been done (for the same + # software package, version, and setting) we add the comparison to this run, + # but remove it from the list of comparisons to be performed + logger.info("Checking database for existing comparison data...") + comparisons_to_run = filter_existing_comparisons( + session, run, comparisons, "blastall", blastall_version, args.fragsize, False + ) + logger.info( + f"\t...after check, still need to run {len(comparisons_to_run)} comparisons" + ) + def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: """Return path to fragmented sequence file and JSON of fragment lengths. From 502becf8bfc4520eb5ca83c7ff7bf0bee96539ef Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:51:48 +0100 Subject: [PATCH 49/75] Add code for case where all comparisons have been run already --- pyani/scripts/subcommands/subcmd_aniblastall.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index e3251c5a..5b2da3a5 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -229,6 +229,19 @@ def subcmd_aniblastall(args: Namespace) -> None: f"\t...after check, still need to run {len(comparisons_to_run)} comparisons" ) + # If there are no comparisons to run, update the Run matrices and exit + # from this function + if not comparisons_to_run: + logger.info( + termcolor( + "All comparison results present in database (skipping comparisons)", + "magenta", + ) + ) + logger.info("Updating summary matrices with existing results") + update_comparison_matrices(session, run) + return + def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: """Return path to fragmented sequence file and JSON of fragment lengths. From 3589efc509087ca2732c27165bc0a6dff1f8cb76 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 22:53:22 +0100 Subject: [PATCH 50/75] Add code for recovery mode --- .../scripts/subcommands/subcmd_aniblastall.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 5b2da3a5..76023372 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -242,6 +242,27 @@ def subcmd_aniblastall(args: Namespace) -> None: update_comparison_matrices(session, run) return + # If we are in recovery mode, we are salvaging output from a previous + # run, and do not necessarily need to rerun all the jobs. In this case, + # we prepare a list of output files we want to recover from the results + # in the output directory. + # ΒΆ Should this use output files, or pull from the database? + if args.recovery: + logger.warning("Entering recovery mode...") + logger.debug( + f"\tIn this mode, existing comparison output from {args.outdir} is reused" + ) + existing_files = collect_existing_output(args.outdir, "blastall", args) + if existing_files: + logger.debug( + f"\tIdentified {len(existing_files)} existing output files for reuse, existing_files[0] (et cetera)" + ) + else: + logger.debug("\tIdentified no existing output files") + else: + existing_files = list() + logger.debug("\tAssuming no pre-existing output files") + def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: """Return path to fragmented sequence file and JSON of fragment lengths. From c74bf3a11ada66020ea2938ee96dfb7bf6103a92 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:03:40 +0100 Subject: [PATCH 51/75] Rename `blastcmd` to `blastallcmd` --- pyani/scripts/subcommands/subcmd_aniblastall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 76023372..95e65dbc 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -84,7 +84,7 @@ class ComparisonJob(NamedTuple): query: str subject: str - blastcmd: str + blastallcmd: str outfile: Path fragsize: int job: pyani_jobs.Job From 953be895f3ac790d71a34b9c9307996ee46e6553 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:07:44 +0100 Subject: [PATCH 52/75] Add `generate_joblist()` --- .../scripts/subcommands/subcmd_aniblastall.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 95e65dbc..37b35503 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -264,6 +264,73 @@ def subcmd_aniblastall(args: Namespace) -> None: logger.debug("\tAssuming no pre-existing output files") +def generate_joblist( + comparisons: List, + existing_files: List, + fragfiles: List, + fragsizes: List, + args: Namespace, +) -> List[ComparisonJob]: + """Return list of ComparisonJobs. + + :param comparisons: list of (Genome, Genome) tuples for which comparisons are needed + :param existing_files: list of pre-existing BLASTALL outputs + :param fragfiles: list of files containing genome fragments + :param fragsizes: list of fragment lengths + :param args: Namespace, command-line arguments + """ + logger = logging.getLogger(__name__) + + existing_files = set(existing_files) # Path objects hashable + + joblist = [] # will hold ComparisonJob structs + for idx, (query, subject) in enumerate( + tqdm(comparisons, disable=args.disable_tqdm) + ): + qprefix, qsuffix = ( + args.outdir / "fragments" / Path(query.path).stem, + Path(query.path).suffix, + ) + qfrags = Path(f"{qprefix}-fragments{qsuffix}") + + blastallcmd = aniblastall.generate_blastall_commands( + qfrags, Path(subject.path), args.outdir, args.blastall_exe + ) + logger.debug(f"Commands to run:\n\t{blastallcmd}\n") + outprefix = blastallcmd.split()[4][:-10] # prefix for blastall output + outfname = Path(outprefix + ".blast_tab") + logger.debug(f"Expected output file for db: {outfname}") + + # If we are in recovery mode, we are salvaging output from a previous + # run, and do not necessarily need to rerun all the jobs. In this case, + # we prepare a list of output files we want to recover from the results + # in the output directory. + + # The comparisons collection always gets updated, so that results are + # added to the database whether they come from recovery mode or are run + # in this call of the script. + if args.recovery and outfname in existing_files: + logger.debug(f"Recovering output from {outfname}, not submitting job") + # Need to track the expected output, but set the job itself to None. + joblist.append( + ComparisonJob( + query, subject, blastallcmd, outfname, args.fragsize, None + ) + ) + else: + logger.debug("Building job") + # Build job + blastalljob = pyani_jobs.Job( + "{args.jobprefix}_{idx:06d}-blastall", blastallcmd + ) + joblist.append( + ComparisonJob( + query, subject, blastallcmd, outfname, args.fragsize, blastalljob + ) + ) + return joblist + + def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]: """Return path to fragmented sequence file and JSON of fragment lengths. From 1971d25bb9114c722dd64dda881b070dfbcb3d74 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:08:07 +0100 Subject: [PATCH 53/75] Add code to generate `joblist` --- pyani/scripts/subcommands/subcmd_aniblastall.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 37b35503..0c3bacd1 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -263,6 +263,15 @@ def subcmd_aniblastall(args: Namespace) -> None: existing_files = list() logger.debug("\tAssuming no pre-existing output files") + # Create list of BLASTALL jobs for each comparison still to be performed + logger.info("Creating blastall jobs for ANIblastall...") + # This method considered, but doesn't get fraglens + # fragfiles = [file for file in fragdir.iterdir()] + joblist = generate_joblist( + comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args + ) + logger.debug(f"...created {len(joblist)} blastall jobs") + def generate_joblist( comparisons: List, From 641a843ccd2662ca3c172ba5fcf825d9cdfc40d8 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:09:20 +0100 Subject: [PATCH 54/75] Add code to run comparisons --- .../scripts/subcommands/subcmd_aniblastall.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 0c3bacd1..e312afc0 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -272,6 +272,11 @@ def subcmd_aniblastall(args: Namespace) -> None: ) logger.debug(f"...created {len(joblist)} blastall jobs") + # Pass jobs to appropriate scheduler + logger.debug(f"Passing {len(joblist)} jobs to {args.scheduler}") + run_aniblastall_jobs(joblist, args) + logger.info("...jobs complete.") + def generate_joblist( comparisons: List, @@ -371,3 +376,43 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path fragpath = outdir / f"{inpath.stem}-fragments.fna" SeqIO.write(outseqs, fragpath, "fasta") return fragpath, sizedict + + +def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: + """Pass ANIblastall blastall jobs to the scheduler. + + :param joblist: list of ComparisonJob namedtuples + :param args: command-line arguments for the run + """ + logger = logging.getLogger(__name__) + logger.debug(f"Scheduler: {args.scheduler}") + + # Entries with None seen in recovery mode: + jobs = [_.job for _ in joblist if _.job] + + if args.scheduler == "multiprocessing": + logger.info("Running jobs with multiprocessing") + if not args.workers: + logger.debug("(using maximum number of worker threads)") + else: + logger.debug(f"(using {args.workers} worker threads, if available)") + cumval = run_mp.run_dependency_graph(jobs, workers=args.workers) + if cumval > 0: + logger.error( + "At least one blastall comparison failed. Please investigate (exiting)." + ) + raise PyaniException("Multiprocessing run failed in ANIblastall.") + logger.info("Multiprocessing run completed without error.") + elif args.scheduler.lower() == "sge": + logger.info("Running jobs with SGE") + logger.debug(f"Setting jobarray group size to {args.sgegroupsize}") + logger.debug(f"Joblist contains {len(joblist)}") + run_sge.run_dependency_graph( + jobs, + jgprefix=args.jobprefix, + sgegroupsize=args.sgegroupsize, + sgeargs=args.sgeargs, + ) + else: + logger.error(termcolor(f"Scheduler {args.scheduler} not recognised", "red")) + raise SystemError(1) From d53c4e844fa1444a878132603a51222c628daa91 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:09:40 +0100 Subject: [PATCH 55/75] Add code to update database --- .../scripts/subcommands/subcmd_aniblastall.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index e312afc0..0b6656c4 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -277,6 +277,16 @@ def subcmd_aniblastall(args: Namespace) -> None: run_aniblastall_jobs(joblist, args) logger.info("...jobs complete.") + # Process output and add results to database + # This requires us to drop out of threading/multiprocessing: Python's SQLite3 + # interface doesn't allow sharing connections and cursors + logger.info("Adding comparison results to database...") + update_comparison_results(joblist, run, session, blastall_version, fraglens, args) + update_comparison_matrices(session, run) + logger.info("...database updated.") + + # raise NotImplementedError + def generate_joblist( comparisons: List, @@ -416,3 +426,55 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: else: logger.error(termcolor(f"Scheduler {args.scheduler} not recognised", "red")) raise SystemError(1) + + +def update_comparison_results( + joblist: List[ComparisonJob], + run, + session, + blastall_version: str, + fraglens: Dict, + args: Namespace, +) -> None: + """Update the Comparison table with the completed result set. + + :param joblist: list of ComparisonJob namedtuples + :param run: Run ORM object for the current ANIblastall run + :param session: active pyanidb session via ORM + :param blastall_version: version of blastall used for the comparison + :param fraglens: dictionary of fragment lengths for each genome + :param args: command-line arguments for this run + + The Comparison table stores individual comparison results, one per row. + """ + logger = logging.getLogger(__name__) + + # Add individual results to Comparison table + for job in tqdm(joblist, disable=args.disable_tqdm): + logger.debug(f"\t{job.query.description} vs {job.subject.description}") + aln_length, sim_errs, ani_pid = aniblastall.parse_blast_tab( + job.outfile, fraglens + ) + logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}") + logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}") + qcov = aln_length / job.query.length + scov = aln_length / job.subject.length + run.comparisons.append( + Comparison( + query=job.query, + subject=job.subject, + aln_length=int(aln_length), + sim_errs=int(sim_errs), + identity=ani_pid, + cov_query=qcov, + cov_subject=scov, + program="blastall", + version=blastall_version, + fragsize=job.fragsize, + maxmatch=False, + ) + ) + + # Populate db + logger.debug("Committing results to database") + session.commit() From aba213c07e03d2609b649f59df56ce377ae2e344 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:12:53 +0100 Subject: [PATCH 56/75] Remove or rename things specific to `anib` to `aniblastall` --- tests/test_aniblastall.py | 67 ++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py index 5c5c6da6..c3645116 100644 --- a/tests/test_aniblastall.py +++ b/tests/test_aniblastall.py @@ -50,72 +50,65 @@ from pandas.util.testing import assert_frame_equal -from pyani import anib, pyani_files +from pyani import anib, pyani_files # probably don't need anib from pyani import aniblastall -class ANIbOutput(NamedTuple): +class ANIblastallOutput(NamedTuple): - """Convenience struct for ANIb output.""" + """Convenience struct for ANIblastall output.""" fragfile: Path - tabfile: Path legacytabfile: Path -class ANIbOutputDir(NamedTuple): +class ANIblastallOutputDir(NamedTuple): - """Convenience struct for ANIb output.""" + """Convenience struct for ANIblastall output.""" infiles: List[Path] fragfiles: List[Path] - blastdir: Path legacyblastdir: Path - blastresult: pd.DataFrame legacyblastresult: pd.DataFrame @pytest.fixture -def anib_output(dir_anib_in): - """Namedtuple of example ANIb output. +def aniblastall_output(dir_aniblastall_in): + """Namedtuple of example ANIblastall output. fragfile - fragmented FASTA query file - tabfile - BLAST+ tabular output legacytabfile - blastall tabular output """ - return ANIbOutput( - dir_anib_in / "NC_002696-fragments.fna", - dir_anib_in / "NC_002696_vs_NC_011916.blast_tab", - dir_anib_in / "NC_002696_vs_NC_010338.blast_tab", + return ANIblastallOutput( + dir_aniblastall_in / "NC_002696-fragments.fna", + dir_aniblastall_in / "NC_002696_vs_NC_010338.blast_tab", ) @pytest.fixture -def anib_output_dir(dir_anib_in): - """Namedtuple of example ANIb output - full directory. +def aniblastall_output_dir(dir_aniblastall_in): + """Namedtuple of example ANIblastall output - full directory. infiles - list of FASTA query files fragfiles - list of fragmented FASTA query files - blastdir - path to BLAST+ output data legacyblastdir - path to blastall output data - blastresult - pd.DataFrame result for BLAST+ legacyblastresult - pd.DataFrame result for blastall """ - return ANIbOutputDir( + return ANIblastallOutputDir( [ _ - for _ in (dir_anib_in / "sequences").iterdir() + for _ in (dir_aniblastall_in / "sequences").iterdir() if _.is_file() and _.suffix == ".fna" ], [ _ - for _ in (dir_anib_in / "fragfiles").iterdir() + for _ in (dir_aniblastall_in / "fragfiles").iterdir() if _.is_file() and _.suffix == ".fna" ], - dir_anib_in / "blastn", - dir_anib_in / "blastall", - pd.read_csv(dir_anib_in / "dataframes" / "blastn_result.csv", index_col=0), - pd.read_csv(dir_anib_in / "dataframes" / "blastall_result.csv", index_col=0), + dir_aniblastall_in / "blastall", + pd.read_csv( + dir_aniblastall_in / "dataframes" / "blastall_result.csv", index_col=0 + ), ) @@ -186,8 +179,8 @@ def test_blastall_graph(path_fna_all, tmp_path, fragment_length): def test_blastall_multiple(path_fna_two, tmp_path): - """Generate legacy BLASTN commands.""" - cmds = aniblastall.generate_blastn_commands(path_fna_two, tmp_path) + """Generate legacy BLASTALL commands.""" + cmds = aniblastall.generate_blastall_commands(path_fna_two, tmp_path) expected = [ ( "blastall -p blastn -o " @@ -208,7 +201,7 @@ def test_blastall_multiple(path_fna_two, tmp_path): def test_blastall_single(path_fna_two, tmp_path): - """Generate legacy BLASTN command-line.""" + """Generate legacy BLASTALL command-line.""" cmd = aniblastall.construct_blastall_cmdline( path_fna_two[0], path_fna_two[1], tmp_path ) @@ -246,23 +239,23 @@ def test_formatdb_single(path_fna, tmp_path): # Test output file parsing for ANIb methods -def test_parse_legacy_blastdir(anib_output_dir): +def test_parse_legacy_blastdir(aniblastall_output_dir): """Parses directory of legacy BLAST output.""" - orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles) - fraglengths = aniblastall.get_fraglength_dict(anib_output_dir.fragfiles) + orglengths = pyani_files.get_sequence_lengths(aniblastall_output_dir.infiles) + fraglengths = aniblastall.get_fraglength_dict(aniblastall_output_dir.fragfiles) result = aniblastall.process_blast( - anib_output_dir.legacyblastdir, orglengths, fraglengths + aniblastall_output_dir.legacyblastdir, orglengths, fraglengths ) assert_frame_equal( result.percentage_identity.sort_index(1).sort_index(), - anib_output_dir.legacyblastresult.sort_index(1).sort_index(), + aniblastall_output_dir.legacyblastresult.sort_index(1).sort_index(), ) -def test_parse_legacy_blasttab(anib_output): +def test_parse_legacy_blasttab(aniblastall_output): """Parses ANIB legacy .blast_tab output.""" - fragdata = aniblastall.get_fraglength_dict([anib_output.fragfile]) - result = aniblastall.parse_blast_tab(anib_output.legacytabfile, fragdata) + fragdata = aniblastall.get_fraglength_dict([aniblastall_output.fragfile]) + result = aniblastall.parse_blast_tab(aniblastall_output.legacytabfile, fragdata) assert ( a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018]) ) From 3233fd5c7db5f15389daa162434ac1468f2f4b6d Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:13:15 +0100 Subject: [PATCH 57/75] Skip unnecessary tests Will probably be removed --- tests/test_aniblastall.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py index c3645116..4ae714b0 100644 --- a/tests/test_aniblastall.py +++ b/tests/test_aniblastall.py @@ -178,6 +178,7 @@ def test_blastall_graph(path_fna_all, tmp_path, fragment_length): assert job.dependencies[0].script.startswith("formatdb") +@pytest.mark.skip(reason="unsure this is needed") def test_blastall_multiple(path_fna_two, tmp_path): """Generate legacy BLASTALL commands.""" cmds = aniblastall.generate_blastall_commands(path_fna_two, tmp_path) @@ -215,6 +216,7 @@ def test_blastall_single(path_fna_two, tmp_path): # Test legacy BLAST database formatting (formatdb) command generation +@pytest.mark.skip(reason="unsure this is needed") def test_formatdb_multiple(path_fna_two, tmp_path): """Generate legacy BLAST db creation commands.""" cmds = aniblastall.generate_blastdb_commands(path_fna_two, tmp_path) From 92932bae1f81f2530b41b6c8d6f6a54209088a61 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 22 Sep 2021 23:14:05 +0100 Subject: [PATCH 58/75] Add `dir_aniblastall_in()` fixture --- tests/conftest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 61b7c428..48bc4ccc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,6 +117,12 @@ def dir_anib_in(): return FIXTUREPATH / "anib" +@pytest.fixture +def dir_aniblastall_in(): + """Input files for ANIblastall tests.""" + return FIXTUREPATH / "anib" + + @pytest.fixture def dir_anim_in(): """Input files for ANIm tests.""" From 9631001d2f2662a9a1ef85f28ebcf064042a2183 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Thu, 30 Sep 2021 23:39:42 +0100 Subject: [PATCH 59/75] Add lines debugging May be removed later --- pyani/pyani_orm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py index c2378fe4..0a59b0f0 100644 --- a/pyani/pyani_orm.py +++ b/pyani/pyani_orm.py @@ -44,6 +44,7 @@ from pathlib import Path from typing import Any, Dict, List, NamedTuple, Optional, Tuple +import logging import numpy as np # type: ignore import pandas as pd # type: ignore @@ -604,6 +605,8 @@ def update_comparison_matrices(session, run) -> None: :param session: active pyanidb session via ORM :param run: Run ORM object for the current ANIm run """ + logger = logging.getLogger(__name__) + # Create dataframes for storing in the Run table # Rows and columns are the (ordered) list of genome IDs genome_ids = sorted([_.genome_id for _ in run.genomes.all()]) @@ -623,6 +626,9 @@ def update_comparison_matrices(session, run) -> None: # Loop over all comparisons for the run and fill in result matrices for cmp in run.comparisons.all(): + logger.debug(f"Comparison: {cmp}") + logger.debug(f"Alignment length: {cmp.aln_length}") + # logger.debug(f"ANI alignment length: {cmp.ani_alnlen}") qid, sid = cmp.query_id, cmp.subject_id df_identity.loc[qid, sid] = cmp.identity df_identity.loc[sid, qid] = cmp.identity @@ -635,6 +641,7 @@ def update_comparison_matrices(session, run) -> None: df_hadamard.loc[qid, sid] = cmp.identity * cmp.cov_query df_hadamard.loc[sid, qid] = cmp.identity * cmp.cov_subject + logger.debug(f"Alnlength df: {df_alnlength.head()}") # Add matrices to the database run.df_identity = df_identity.to_json() run.df_coverage = df_coverage.to_json() From c7f60cc7422a81534cf9385e73e42b9310948011 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 5 Oct 2021 15:09:35 +0100 Subject: [PATCH 60/75] Fix naming of elements in `test_subcmd_anib" namespace --- tests/test_subcmd_08_anib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py index 68664c47..72664741 100644 --- a/tests/test_subcmd_08_anib.py +++ b/tests/test_subcmd_08_anib.py @@ -106,13 +106,13 @@ def setUp(self): outdir=self.dirpaths.outdir, dbpath=self.dbpath, force=False, - name="test_subcmd anib", + name="test_subcmd_anib", classes=self.lblfiles.classes, labels=self.lblfiles.labels, recovery=False, cmdline="ANIb test suite", blastn_exe=self.exes.blastn_exe, - filter_exe=self.exes.format_exe, + format_exe=self.exes.format_exe, fragsize=1020, scheduler=self.scheduler, workers=None, From c17413367e27ff718f6a16dc510f8e802957e447 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 5 Oct 2021 15:11:06 +0100 Subject: [PATCH 61/75] Update naming of `TestANIbSubcommand()` --- tests/test_subcmd_08_anib.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py index 72664741..cac3758b 100644 --- a/tests/test_subcmd_08_anib.py +++ b/tests/test_subcmd_08_anib.py @@ -77,8 +77,7 @@ LabelPaths = namedtuple("LabelPaths", "classes labels") -@pytest.mark.xfail(reason="ANIb is not currently fully implemented") -class TestANIbsubcommand(unittest.TestCase): +class TestANIbSubcommand(unittest.TestCase): """Class defining tests of the pyani anib subcommand.""" From dc89bcd91a6b3c5d4e3130249b26d5b7380c25db Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 5 Oct 2021 15:11:51 +0100 Subject: [PATCH 62/75] Switch to use `NamedTuple` --- tests/test_subcmd_08_anib.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py index cac3758b..17bbb0ed 100644 --- a/tests/test_subcmd_08_anib.py +++ b/tests/test_subcmd_08_anib.py @@ -59,7 +59,7 @@ import unittest from argparse import Namespace -from collections import namedtuple +from typing import NamedTuple from pathlib import Path import pytest @@ -68,13 +68,21 @@ # Convenience struct with paths to third-party executables -ThirdPartyExes = namedtuple("ThirdPartyExes", "blastn_exe format_exe") +class ThirdPartyExes(NamedTuple): + blastall_exe: Path + format_exe: Path + # Convenience struct with paths to working directories -DirPaths = namedtuple("DirPaths", "indir outdir") +class DirPaths(NamedTuple): + indir: Path + outdir: Path + # Convenience struct for label/class files -LabelPaths = namedtuple("LabelPaths", "classes labels") +class LabelPaths(NamedTuple): + classes: Path + labels: Path class TestANIbSubcommand(unittest.TestCase): From 13add41314c3c07cc575fd806f6e4193c7c7061a Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 5 Oct 2021 15:13:18 +0100 Subject: [PATCH 63/75] Add `test_subcmd_10_aniblastall.py` --- tests/test_subcmd_10_aniblastall.py | 94 +++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 tests/test_subcmd_10_aniblastall.py diff --git a/tests/test_subcmd_10_aniblastall.py b/tests/test_subcmd_10_aniblastall.py new file mode 100644 index 00000000..a282880b --- /dev/null +++ b/tests/test_subcmd_10_aniblastall.py @@ -0,0 +1,94 @@ +"""Test aniblastall subcommand for pyani. + +The test suite is intended to be run from the repository root using: + +pytest -v + +Each command CMD available at the command line as pyani is +tested in its own class as a subclass of unittest.TestCase, where +setUp() defines input/output files, a null logger (which is also +picked up by nosetests), and a dictionary of command lines, keyed +by test name, with values representing command-line options. + +For each test, command line options are defined in a Namespace and +passed as the sole argument to the appropriate subcommand. +""" + +import logging +import os +import unittest + +from argparse import Namespace +from typing import NamedTuple +from pathlib import Path + +import pytest + +from pyani.scripts import subcommands + + +# Convenience struct with paths to third-party executables +class ThirdPartyExes(NamedTuple): + blastall_exe: Path + format_exe: Path + + +# Convenience struct with paths to working directories +class DirPaths(NamedTuple): + indir: Path + outdir: Path + + +# Convenience struct for label/class files +class LabelPaths(NamedTuple): + classes: Path + labels: Path + + +class TestANIblastallSubcommand(unittest.TestCase): + + """Class defining tests of the pyani aniblastall subcommand.""" + + def setUp(self): + """Configure parameters for tests.""" + self.dirpaths = DirPaths( + Path("tests/test_input/subcmd_anib"), + Path("tests/test_output/subcmd_aniblastall"), + ) + os.makedirs(self.dirpaths.outdir, exist_ok=True) + self.dbpath = Path("tests/test_output/subcmd_createdb/pyanidb") + self.lblfiles = LabelPaths( + self.dirpaths.indir / "classes.txt", self.dirpaths.indir / "labels.txt" + ) + self.exes = ThirdPartyExes("blastall", "formatdb") + self.scheduler = "multiprocessing" + + # Null logger instance + self.logger = logging.getLogger("TestIndexSubcommand logger") + self.logger.addHandler(logging.NullHandler()) + + # Command line namespaces + self.argsdict = { + "aniblastall": Namespace( + indir=self.dirpaths.indir, + outdir=self.dirpaths.outdir, + dbpath=self.dbpath, + force=False, + name="test_subcmd_aniblastall", + classes=self.lblfiles.classes, + labels=self.lblfiles.labels, + recovery=False, + cmdline="ANIblastall test suite", + blastall_exe=self.exes.blastall_exe, + formatdb_exe=self.exes.format_exe, + fragsize=1020, + scheduler=self.scheduler, + workers=None, + disable_tqdm=True, + jobprefix="ANIblastallTest", + ) + } + + def test_anib(self): + """Test aniblastall run.""" + subcommands.subcmd_aniblastall(self.argsdict["aniblastall"]) From 7566329a30c91b36dbbbfd73b5fee56a7120d7dd Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 5 Oct 2021 15:15:40 +0100 Subject: [PATCH 64/75] Make `test_subcmd_04_anim.py` match other method subcommand tests --- tests/test_subcmd_04_anim.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_subcmd_04_anim.py b/tests/test_subcmd_04_anim.py index 7387bdae..e514523f 100644 --- a/tests/test_subcmd_04_anim.py +++ b/tests/test_subcmd_04_anim.py @@ -48,20 +48,28 @@ import unittest from argparse import Namespace -from collections import namedtuple +from typing import NamedTuple from pathlib import Path from pyani.scripts import subcommands # Convenience struct with paths to third-party executables -ThirdPartyExes = namedtuple("ThirdPartyExes", "nucmer_exe filter_exe") +class ThirdPartyExes(NamedTuple): + nucmer_exe: Path + format_exe: Path + # Convenience struct with paths to working directories -DirPaths = namedtuple("DirPaths", "indir outdir") +class DirPaths(NamedTuple): + indir: Path + outdir: Path + # Convenience struct for label/class files -LabelPaths = namedtuple("LabelPaths", "classes labels") +class LabelPaths(NamedTuple): + classes: Path + labels: Path class TestANImSubcommand(unittest.TestCase): @@ -94,7 +102,7 @@ def setUp(self): outdir=self.dirpaths.outdir, dbpath=self.dbpath, force=False, - name="test_anim", + name="test_subcmd_anim", classes=self.lblfiles.classes, labels=self.lblfiles.labels, recovery=False, From 3c688d407e95582dacf220e0a966079ce35339d0 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Tue, 5 Oct 2021 15:33:33 +0100 Subject: [PATCH 65/75] Fix names of executable variables --- tests/test_subcmd_04_anim.py | 2 +- tests/test_subcmd_08_anib.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_subcmd_04_anim.py b/tests/test_subcmd_04_anim.py index e514523f..8863d419 100644 --- a/tests/test_subcmd_04_anim.py +++ b/tests/test_subcmd_04_anim.py @@ -57,7 +57,7 @@ # Convenience struct with paths to third-party executables class ThirdPartyExes(NamedTuple): nucmer_exe: Path - format_exe: Path + filter_exe: Path # Convenience struct with paths to working directories diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py index 17bbb0ed..5f9dce5e 100644 --- a/tests/test_subcmd_08_anib.py +++ b/tests/test_subcmd_08_anib.py @@ -69,7 +69,7 @@ # Convenience struct with paths to third-party executables class ThirdPartyExes(NamedTuple): - blastall_exe: Path + blastn_exe: Path format_exe: Path From bc25ccc9348c1760f603a42e19950151147f1523 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Thu, 7 Oct 2021 11:59:39 +0100 Subject: [PATCH 66/75] Pass both stdout and stderr to the regex version search --- pyani/anib.py | 2 +- pyani/aniblastall.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyani/anib.py b/pyani/anib.py index 43101ebc..e711cbe5 100644 --- a/pyani/anib.py +++ b/pyani/anib.py @@ -143,7 +143,7 @@ def get_version(blast_exe: Path = pyani_config.BLASTN_DEFAULT) -> str: check=True, ) version = re.search( # type: ignore - r"(?<=blastn:\s)[0-9\.]*\+", str(result.stdout, "utf-8") + r"(?<=blastn:\s)[0-9\.]*\+", str(result.stdout + result.stderr, "utf-8") ).group() if 0 == len(version.strip()): diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py index 6e3f4899..c6349a78 100644 --- a/pyani/aniblastall.py +++ b/pyani/aniblastall.py @@ -111,7 +111,7 @@ def get_version(blastall_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str: return f"blastall exists at {blastall_path} but could not be executed" version = re.search( # type: ignore - r"(?<=blastall\s)[0-9\.]*", str(result.stderr, "utf-8") + r"(?<=blastall\s)[0-9\.]*", str(result.stderr + result.stdout, "utf-8") ).group() if 0 == len(version.strip()): From a313d2f2bce97df9894761ed8071033260af0bff Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Thu, 7 Oct 2021 12:48:25 +0100 Subject: [PATCH 67/75] Set `indir` and `outdir` to required in `anib_parser.py` --- pyani/scripts/parsers/anib_parser.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyani/scripts/parsers/anib_parser.py b/pyani/scripts/parsers/anib_parser.py index b2b99785..61e0b53c 100644 --- a/pyani/scripts/parsers/anib_parser.py +++ b/pyani/scripts/parsers/anib_parser.py @@ -68,7 +68,8 @@ def build( "--indir", action="store", dest="indir", - default=None, + # default=None, + required=True, type=Path, help="input genome directory", ) @@ -77,7 +78,8 @@ def build( "--outdir", action="store", dest="outdir", - default=None, + # default=None, + required=True, type=Path, help="output analysis results directory", ) From 3ccd1d9cba39cbd9f02c87ba7e680c944f0039f0 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Thu, 7 Oct 2021 13:16:52 +0100 Subject: [PATCH 68/75] Add `docs/subcmd_anib.rst` --- docs/subcmd_anib.rst | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/docs/subcmd_anib.rst b/docs/subcmd_anib.rst index 25b6a9c2..82e18f77 100644 --- a/docs/subcmd_anib.rst +++ b/docs/subcmd_anib.rst @@ -3,3 +3,78 @@ ============== ``pyani anib`` ============== + +The ``anib`` subcommand will carry out ANIb analysis using genome files contained in the ``indir`` directory, writing result files to the ``outdir`` directory, and recording data about each comparison and run in a local `SQLite3`_ database. + +.. code-block:: text + + usage: pyani.py anib [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version] + [--citation] [--scheduler {multiprocessing,SGE}] + [--workers WORKERS] [--SGEgroupsize SGEGROUPSIZE] + [--SGEargs SGEARGS] [--jobprefix JOBPREFIX] [--name NAME] + [--classes CLASSES] [--labels LABELS] [--recovery] -i INDIR -o + OUTDIR [--dbpath DBPATH] [--blastn_exe BLASTN_EXE] + [--format_exe FORMAT_EXE] [--fragsize FRAGSIZE] + +.. _SQLite3: https://www.sqlite.org/index.html + +----------------- +Flagged arguments +----------------- + +``--blastn_exe BLASTN_EXE`` + Path to the ``blastn`` executable. Default: ``blastn`` + +``--classes CLASSFNAME`` + Use the set of classes (one per genome sequence file) found in the file ``CLASSFNAME`` in ``INDIR``. Default: ``classes.txt`` + +``--dbpath DBPATH`` + Path to the location of the local ``pyani`` database to be used. Default: ``.pyani/pyanidb`` + +``--disable_tqdm`` + Disable the ``tqdm`` progress bar while the ANIb process runs. This is useful when testing to avoid aesthetic problems with test output. + +``--format_exe FORMAT_EXE`` + Path to the ``blastn`` executable. Default: ``makeblastdb`` + +``--fragsize FRAGSIZE`` + Fragment size to use in analysis. (default: 1020) + +``-h, --help`` + Display usage information for ``pyani anib``. + +``-i INDIR, --input INDIR`` + Path to the directory containing indexed genome files to be used for the analysis. + +``--jobprefix JOBPREFIX`` + Use the string ``JOBPREFIX`` as a prefix for SGE job submission names. Default: ``PYANI`` + +``--labels LABELFNAME`` + Use the set of labels (one per genome sequence file) found in the file ``LABELFNAME`` in ``INDIR``. Default: ``labels.txt`` + +``-l LOGFILE, --logfile LOGFILE`` + Provide the location ``LOGFILE`` to which a logfile of the ANIb process will be written. + +``--name NAME`` + Use the string ``NAME`` to identify this ``ANIb`` run in the ``pyani`` database. + +``-o OUTDIR, --outdir OUTDIR`` + Path to a directory where comparison output files will be written. + +``--recovery`` + Use existing ``ANIb`` comparison output if available, e.g. if recovering from a failed job submission. Using this option will not generate a new comparison if the old output files exist. + +``--scheduler {multiprocessing, SGE}`` + Specify the job scheduler to be used when parallelising genome comparisons: one of ``multiprocessing`` (use many cores on the current machine) or ``SGE`` (use an SGE or OGE job scheduler). Default: ``multiprocessing``. + +``--SGEargs SGEARGS`` + Pass additional arguments ``SGEARGS`` to ``qsub`` when running the SGE-distributed jobs. + +``--SGEgroupsize SGEGROUPSIZE`` + Create SGE arrays containing SGEGROUPSIZE comparison jobs. Default: 10000 + +``-v, --verbose`` + Provide verbose output to ``STDOUT``. + +``--workers WORKERS`` + Spawn WORKERS worker processes with the ``--scheduler multiprocessing`` option. Default: 0 (use all cores) From f423dbe3ce48c8cbc1e26ebe0a097980be8735f8 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Thu, 7 Oct 2021 13:17:39 +0100 Subject: [PATCH 69/75] Add documentation for `aniblastall` --- docs/run_aniblastall.rst | 121 ++++++++++++++++++++++++++++++++++++ docs/subcmd_aniblastall.rst | 80 ++++++++++++++++++++++++ docs/subcommands.rst | 1 + 3 files changed, 202 insertions(+) create mode 100644 docs/run_aniblastall.rst create mode 100644 docs/subcmd_aniblastall.rst diff --git a/docs/run_aniblastall.rst b/docs/run_aniblastall.rst new file mode 100644 index 00000000..cb5884d7 --- /dev/null +++ b/docs/run_aniblastall.rst @@ -0,0 +1,121 @@ +.. _pyani-run_aniblastall: + +===================== +Running ANIblastall analysis +===================== + +``pyani`` implements average nucleotide identity analysis using `NCBI-BLAST`_ (*ANIblastall*) as defined in Goris `et al.` (2007) (`doi:10.1099/ijs.0.64483-0`_). To run ANIblastall on a set of input genomes, use the ``pyani aniblastall`` subcommand. + +In brief, the analysis proceeds as follows for a set of input prokaryotic genomes: + +1. Each input genome is fragmented into consecutive sequences of a given size (default: 1020bp) +2. A new ``BLAST`` database is built from each input genome sequence +3. `NCBI-BLAST`_ is used to perform pairwise comparisons of each input genome fragment set against the databases for each other input genome, to identify homologous (alignable) regions. +4. For each comparison, the alignment output is parsed, and the following values are calculated: + + - total number of aligned bases on each genome + - fraction of each genome that is aligned (the *coverage*) + - the proportion of all aligned regions that is identical in each genome (the *ANI*) + - the number of unaligned or non-identical bases (the *similarity errors*) + - the product of *coverage* and *ANI* + +The output values are recorded in the ``pyani`` database. + +.. NOTE:: + The `NCBI-BLAST`_ comparisons are asymmetric, and performed in both directions for a pair of genomes (i.e. "fragmented A vs complete B" and "fragmented B vs complete A"). + +.. TIP:: + The `NCBI-BLAST`_ comparisons are embarrasingly parallel, and can be distributed across cores on an `Open Grid Scheduler`_-compatible cluster, using the ``--scheduler SGE`` option. + +.. ATTENTION:: + ``pyani aniblastall`` requires that a working copy of `NCBI-BLAST`_ is available. Please see :ref:`pyani-installation` for information about installing this package. + +For more information about the ``pyani aniblastall`` subcommand, please see the :ref:`pyani-subcmd-aniblastall` page, or issue the command ``pyani aniblastall -h`` to see the inline help. + +--------------------- +Perform ANIblastall analysis +--------------------- + +The basic form of the command is: + +.. code-block:: bash + + pyani aniblastall -i -o + +This instructs ``pyani`` to perform ANIblastall on the genome FASTA files in ````, and write any output files to ````. For example, the following command performs ANIblastall on genomes in the directory ``genomes`` and writes output to a new directory ``genomes_ANIblastall``: + +.. code-block:: bash + + pyani aniblastall -i genomes -o genomes_ANIblastall + +.. NOTE:: + While running, ``pyani aniblastall`` will show progress bars unless these are disabled with the option ``--disable_tqdm`` + +This command will write the intermediate `NCBI-BLAST`_ output to the directory ``genomes_ANIblastall``, where the results can be inspected if required. + +.. + I am unsure if this is relevant for aniblastall + .. code-block:: bash + + $ ls genomes_ANIblastall/ + nucmer_output + +.. ATTENTION:: + To view the output ANIblastall results, you will need to use the ``pyani report`` or ``pyani plot`` subcommands. Please see :ref:`pyani-subcmd-report` and :ref:`pyani-subcmd-plot` for more details. + +---------------------------------------------- +Perform ANIblastall analysis with Open Grid Scheduler +---------------------------------------------- + +The `NCBI-BLAST`_ comparisons are embarrasingly parallel, and these jobs can be distributed across cores in a cluster using the `Open Grid Scheduler`_. To enable this during the analysis, use the ``--scheduler SGE`` option: + +.. code-block:: bash + + pyani aniblastall --scheduler SGE -i genomes -o genomes_ANIblastall + +.. NOTE:: + Jobs are submitted as *array jobs* to keep the scheduler queue short. + +.. NOTE:: + If ``--scheduler SGE`` is not specified, all `NCBI-BLAST`_ jobs are run locally with ``Python``'s ``multiprocessing`` module. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Controlling parameters of Open Grid Scheduler +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is possible to control the following features of `Open Grid Scheduler`_ `via` the ``pyani aniblastall`` subcommand: + +- The array job size (by default, comparison jobs are batched in arrays of 10,000) +- The prefix string for the job, as reported in the scheduler queue +- Arguments to the ``qsub`` job submission command + +These allow for useful control of job execution. For example, the command: + +.. code-block:: bash + + pyani aniblastall --scheduler SGE --SGEgroupsize 5000 -i genomes -o genomes_ANIblastall + +will batch ``ANIblastall`` jobs in groups of 500 for the scheduler. The command: + +.. code-block:: bash + + pyani aniblastall --scheduler SGE --jobprefix My_Ace_Job -i genomes -o genomes_ANIblastall + +will prepend the string ``My_Ace_Job`` to your job in the scheduler queue. And the command: + +.. code-block:: bash + + pyani aniblastall --scheduler SGE --SGEargs "-m e -M my.name@my.domain" --SGEgroupsize 5000 -i genomes -o genomes_ANIblastall + +will email ``my.name@my.domain`` when the jobs finish. + + +---------- +References +---------- + +- Goris`et al.` (2007) `Int J Syst Evol Micr` _57_: 81-91. `doi:10.1099/ijs.0.64483-0`. + +.. _doi:10.1099/ijs.0.64483-0: https://dx.doi.org/10.1099/ijs.0.64483-0 +.. _NCBI-BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download +.. _Open Grid Scheduler: http://gridscheduler.sourceforge.net/ diff --git a/docs/subcmd_aniblastall.rst b/docs/subcmd_aniblastall.rst new file mode 100644 index 00000000..c4f472c7 --- /dev/null +++ b/docs/subcmd_aniblastall.rst @@ -0,0 +1,80 @@ +.. _pyani-subcmd-aniblastall: + +============== +``pyani aniblastall`` +============== + +The ``aniblastall`` subcommand will carry out ANIb analysis using genome files contained in the ``indir`` directory, writing result files to the ``outdir`` directory, and recording data about each comparison and run in a local `SQLite3`_ database. + +.. code-block:: text + + usage: pyani.py aniblastall [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version] + [--citation] [--scheduler {multiprocessing,SGE}] + [--workers WORKERS] [--SGEgroupsize SGEGROUPSIZE] + [--SGEargs SGEARGS] [--jobprefix JOBPREFIX] [--name NAME] + [--classes CLASSES] [--labels LABELS] [--recovery] -i INDIR -o + OUTDIR [--dbpath DBPATH] [--blastall_exe BLASTALL_EXE] + [--format_exe FORMAT_EXE] [--fragsize FRAGSIZE] + +.. _SQLite3: https://www.sqlite.org/index.html + +----------------- +Flagged arguments +----------------- + +``--blastall_exe BLASTALL_EXE`` + Path to the ``blastall`` executable. Default: ``blastall`` + +``--classes CLASSFNAME`` + Use the set of classes (one per genome sequence file) found in the file ``CLASSFNAME`` in ``INDIR``. Default: ``classes.txt`` + +``--dbpath DBPATH`` + Path to the location of the local ``pyani`` database to be used. Default: ``.pyani/pyanidb`` + +``--disable_tqdm`` + Disable the ``tqdm`` progress bar while the ANIblastall process runs. This is useful when testing to avoid aesthetic problems with test output. + +``--format_exe FORMAT_EXE`` + Path to the ``blastall`` executable. Default: ``formatdb`` + +``--fragsize FRAGSIZE`` + Fragment size to use in analysis. (default: 1020) + +``-h, --help`` + Display usage information for ``pyani aniblastall``. + +``-i INDIR, --input INDIR`` + Path to the directory containing indexed genome files to be used for the analysis. + +``--jobprefix JOBPREFIX`` + Use the string ``JOBPREFIX`` as a prefix for SGE job submission names. Default: ``PYANI`` + +``--labels LABELFNAME`` + Use the set of labels (one per genome sequence file) found in the file ``LABELFNAME`` in ``INDIR``. Default: ``labels.txt`` + +``-l LOGFILE, --logfile LOGFILE`` + Provide the location ``LOGFILE`` to which a logfile of the ANIb process will be written. + +``--name NAME`` + Use the string ``NAME`` to identify this ``ANIblastall`` run in the ``pyani`` database. + +``-o OUTDIR, --outdir OUTDIR`` + Path to a directory where comparison output files will be written. + +``--recovery`` + Use existing ``ANIblastall`` comparison output if available, e.g. if recovering from a failed job submission. Using this option will not generate a new comparison if the old output files exist. + +``--scheduler {multiprocessing, SGE}`` + Specify the job scheduler to be used when parallelising genome comparisons: one of ``multiprocessing`` (use many cores on the current machine) or ``SGE`` (use an SGE or OGE job scheduler). Default: ``multiprocessing``. + +``--SGEargs SGEARGS`` + Pass additional arguments ``SGEARGS`` to ``qsub`` when running the SGE-distributed jobs. + +``--SGEgroupsize SGEGROUPSIZE`` + Create SGE arrays containing SGEGROUPSIZE comparison jobs. Default: 10000 + +``-v, --verbose`` + Provide verbose output to ``STDOUT``. + +``--workers WORKERS`` + Spawn WORKERS worker processes with the ``--scheduler multiprocessing`` option. Default: 0 (use all cores) diff --git a/docs/subcommands.rst b/docs/subcommands.rst index fa29f872..e169383a 100644 --- a/docs/subcommands.rst +++ b/docs/subcommands.rst @@ -22,6 +22,7 @@ This document links out to detailed instructions for each of the ``pyani`` subco subcmd_createdb subcmd_anim subcmd_anib + subcmd_aniblastall subcmd_report subcmd_plot subcmd_classify From 8c40113634b509c4697ccbc615b180dc45b11b87 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Sun, 12 Dec 2021 21:06:44 +0000 Subject: [PATCH 70/75] Replace f-strings in logging statements --- pyani/scripts/subcommands/subcmd_anib.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py index 991aa765..2d8f3fd4 100644 --- a/pyani/scripts/subcommands/subcmd_anib.py +++ b/pyani/scripts/subcommands/subcmd_anib.py @@ -179,7 +179,7 @@ def subcmd_anib(args: Namespace) -> None: os.makedirs(args.outdir, exist_ok=True) except IOError: logger.error( - f"Could not create output directory {args.outdir} (exiting)", exc_info=True + "Could not create output directory %s (exiting)", args.outdir, exc_info=True ) raise SystemError(1) fragdir = Path(str(args.outdir)) / "fragments" @@ -231,7 +231,7 @@ def subcmd_anib(args: Namespace) -> None: session, run, comparisons, "blastn", blastn_version, args.fragsize, False ) logger.info( - f"\t...after check, still need to run {len(comparisons_to_run)} comparisons" + "\t...after check, still need to run %s comparisons", len(comparisons_to_run) ) # If there are no comparisons to run, update the Run matrices and exit @@ -291,7 +291,7 @@ def subcmd_anib(args: Namespace) -> None: comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args ) logger.debug("...created %s blastn jobs", len(joblist)) - # print(f"Joblist: {joblist}") + # Pass jobs to appropriate scheduler logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler) run_anib_jobs(joblist, args) @@ -305,8 +305,6 @@ def subcmd_anib(args: Namespace) -> None: update_comparison_matrices(session, run) logger.info("...database updated.") - # raise NotImplementedError - def generate_joblist( comparisons: List, @@ -472,8 +470,7 @@ def update_comparison_results( for job in tqdm(joblist, disable=args.disable_tqdm): logger.debug("\t%s vs %s", job.query.description, job.subject.description) aln_length, sim_errs, ani_pid = anib.parse_blast_tab(job.outfile, fraglens) - logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}") - logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}") + qcov = aln_length / job.query.length scov = aln_length / job.subject.length run.comparisons.append( From 0596e7776995a8c3be9a3033811f8d2399211067 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 13 Apr 2022 16:28:08 +0100 Subject: [PATCH 71/75] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1bb86dd5..4c2608e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Scratch directory for local testing scratch/ +makefile_template # Mac-related dreck .DS_Store @@ -72,4 +73,4 @@ venv-* # Extra documentation output classes_pyani.pdf -packages_pyani.pdf \ No newline at end of file +packages_pyani.pdf From 6b7b52f38d92eb0bac4258d8ffbf550a62340359 Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 13 Apr 2022 17:35:13 +0100 Subject: [PATCH 72/75] Update call to `add_run()` to fit the function's new return value --- pyani/scripts/subcommands/subcmd_aniblastall.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index 0b6656c4..c3e4e620 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -132,7 +132,7 @@ def subcmd_aniblastall(args: Namespace) -> None: # Add information about this run to the database logger.debug(f"Adding run info to database {args.dbpath}...") try: - run = add_run( + run, run_id = add_run( session, method="ANIblastall", cmdline=args.cmdline, @@ -143,20 +143,20 @@ def subcmd_aniblastall(args: Namespace) -> None: except PyaniORMException: logger.error("Could not add run to the database (exiting)", exc_info=True) raise SystemExit(1) - logger.debug(f"\t...added run ID: {run} to the database") + logger.debug(f"\t...added run ID: {run_id} to the database") # Identify input files for comparison, and populate the database - logger.debug(f"Adding files for {run} to database...") + logger.debug(f"Adding files for run {run_id} to database...") try: genome_ids = add_run_genomes( session, run, args.indir, args.classes, args.labels ) except PyaniORMException: logger.error( - f"Could not add genomes to database for run {run} (exiting)", exc_info=True + "Could not add genomes to database for run (exiting)", exc_info=True ) raise SystemExit(1) - logger.debug(f"\t...added gnome IDs: {genome_ids}") + logger.debug(f"\t...added genome IDs for run {run_id}: {genome_ids}") # Get list of genomes for this anlaysis from the database logger.info("Compiling genomes for comparison") From 23d951fafd839448494413b20c3b1a7492d5eacb Mon Sep 17 00:00:00 2001 From: baileythegreen Date: Wed, 13 Apr 2022 17:36:04 +0100 Subject: [PATCH 73/75] Remove f-strings from logging calls --- .../scripts/subcommands/subcmd_aniblastall.py | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py index c3e4e620..68d2a0d2 100644 --- a/pyani/scripts/subcommands/subcmd_aniblastall.py +++ b/pyani/scripts/subcommands/subcmd_aniblastall.py @@ -112,25 +112,25 @@ def subcmd_aniblastall(args: Namespace) -> None: # Get BLASTALL version - this will be used in the database entreis blastall_version = aniblastall.get_version(args.blastall_exe) - logger.info(termcolor(f"BLAST+ blastall version: {blastall_version}", "cyan")) + logger.info(termcolor("BLAST+ blastall version: %s", blastall_version, "cyan")) # Use provided name, or make new one for this analysis start_time = datetime.datetime.now() name = args.name or "_".join(["ANIblastall", start_time.isoformat()]) - logger.info(termcolor(f"Analysis name: {name}", "cyan")) + logger.info(termcolor("Analysis name: %s", name, "cyan")) # Connect to existing database (which may be "clean" or have old analyses) - logger.debug(f"Connecting to database {args.dbpath}") + logger.debug("Connecting to database %s", args.dbpath) try: session = get_session(args.dbpath) except Exception: logger.error( - f"Could not connect to database {args.dbpath} (exiting)", exc_info=True + "Could not connect to database %s (exiting)", args.dbpath, exc_info=True ) raise SystemExit(1) # Add information about this run to the database - logger.debug(f"Adding run info to database {args.dbpath}...") + logger.debug("Adding run info to database %s...", args.dbpath) try: run, run_id = add_run( session, @@ -143,10 +143,10 @@ def subcmd_aniblastall(args: Namespace) -> None: except PyaniORMException: logger.error("Could not add run to the database (exiting)", exc_info=True) raise SystemExit(1) - logger.debug(f"\t...added run ID: {run_id} to the database") + logger.debug("\t...added run ID: %s to the database", run_id) # Identify input files for comparison, and populate the database - logger.debug(f"Adding files for run {run_id} to database...") + logger.debug("Adding files for run %s to database...", run_id) try: genome_ids = add_run_genomes( session, run, args.indir, args.classes, args.labels @@ -156,21 +156,21 @@ def subcmd_aniblastall(args: Namespace) -> None: "Could not add genomes to database for run (exiting)", exc_info=True ) raise SystemExit(1) - logger.debug(f"\t...added genome IDs for run {run_id}: {genome_ids}") + logger.debug("\t...added genome IDs for run %s: %s", run_id, genome_ids) # Get list of genomes for this anlaysis from the database logger.info("Compiling genomes for comparison") genomes = run.genomes.all() - logger.debug(f"\tCollected {len(genomes)} genomes for this run") + logger.debug("\tCollected %s genomes for this run", len(genomes)) # Create output directories. We create the amin parent directory (args.outdir), but # also subdirectories for the BLAST databases. - logger.debug(f"Creating output directory {args.outdir}") + logger.debug("Creating output directory %s", args.outdir) try: os.makedirs(args.outdir, exist_ok=True) except IOError: logger.error( - f"Could not create output directory {args.outdir} (exiting)", exc_info=True + "Could not create output directory %s (exiting)", args.outdir, exc_info=True ) raise SystemError(1) fragdir = Path(str(args.outdir)) / "fragments" @@ -188,7 +188,6 @@ def subcmd_aniblastall(args: Namespace) -> None: fragpath, fragsizes = fragment_fasta_file( Path(str(genome.path)), Path(str(fragdir)), args.fragsize ) - logger.info(f"fragsizes: {type(fragsizes)}") fragfiles.update({Path(genome.path).stem: fragpath}) fraglens.update({Path(genome.path).stem: fragsizes}) @@ -216,7 +215,7 @@ def subcmd_aniblastall(args: Namespace) -> None: "Compiling pairwise comparisons (this can take time for large datasets)..." ) comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2)) - logger.info(f"\t...total pairwise comparisons to be performed: {len(comparisons)}") + logger.info("\t...total pairwise comparisons to be performed: %s", len(comparisons)) # Check for existing comparisons; if one has already been done (for the same # software package, version, and setting) we add the comparison to this run, @@ -226,7 +225,7 @@ def subcmd_aniblastall(args: Namespace) -> None: session, run, comparisons, "blastall", blastall_version, args.fragsize, False ) logger.info( - f"\t...after check, still need to run {len(comparisons_to_run)} comparisons" + "\t...after check, still need to run %s comparisons", len(comparisons_to_run) ) # If there are no comparisons to run, update the Run matrices and exit @@ -250,12 +249,13 @@ def subcmd_aniblastall(args: Namespace) -> None: if args.recovery: logger.warning("Entering recovery mode...") logger.debug( - f"\tIn this mode, existing comparison output from {args.outdir} is reused" + "\tIn this mode, existing comparison output from %s is reused", args.outdir ) existing_files = collect_existing_output(args.outdir, "blastall", args) if existing_files: logger.debug( - f"\tIdentified {len(existing_files)} existing output files for reuse, existing_files[0] (et cetera)" + "\tIdentified %s existing output files for reuse, existing_files[0] (et cetera)", + len(existing_files), ) else: logger.debug("\tIdentified no existing output files") @@ -270,10 +270,10 @@ def subcmd_aniblastall(args: Namespace) -> None: joblist = generate_joblist( comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args ) - logger.debug(f"...created {len(joblist)} blastall jobs") + logger.debug("...created %s blastall jobs", len(joblist)) # Pass jobs to appropriate scheduler - logger.debug(f"Passing {len(joblist)} jobs to {args.scheduler}") + logger.debug("Passing %s jobs to %s", len(joblist), args.scheduler) run_aniblastall_jobs(joblist, args) logger.info("...jobs complete.") @@ -320,10 +320,10 @@ def generate_joblist( blastallcmd = aniblastall.generate_blastall_commands( qfrags, Path(subject.path), args.outdir, args.blastall_exe ) - logger.debug(f"Commands to run:\n\t{blastallcmd}\n") + logger.debug("Commands to run:\n\t%s\n", blastallcmd) outprefix = blastallcmd.split()[4][:-10] # prefix for blastall output outfname = Path(outprefix + ".blast_tab") - logger.debug(f"Expected output file for db: {outfname}") + logger.debug("Expected output file for db: %s", outfname) # If we are in recovery mode, we are salvaging output from a previous # run, and do not necessarily need to rerun all the jobs. In this case, @@ -334,7 +334,7 @@ def generate_joblist( # added to the database whether they come from recovery mode or are run # in this call of the script. if args.recovery and outfname in existing_files: - logger.debug(f"Recovering output from {outfname}, not submitting job") + logger.debug("Recovering output from %s, not submitting job", outfname) # Need to track the expected output, but set the job itself to None. joblist.append( ComparisonJob( @@ -395,7 +395,7 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: :param args: command-line arguments for the run """ logger = logging.getLogger(__name__) - logger.debug(f"Scheduler: {args.scheduler}") + logger.debug("Scheduler: %s", args.scheduler) # Entries with None seen in recovery mode: jobs = [_.job for _ in joblist if _.job] @@ -405,7 +405,7 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: if not args.workers: logger.debug("(using maximum number of worker threads)") else: - logger.debug(f"(using {args.workers} worker threads, if available)") + logger.debug("(using %s worker threads, if available)", args.workers) cumval = run_mp.run_dependency_graph(jobs, workers=args.workers) if cumval > 0: logger.error( @@ -415,8 +415,8 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: logger.info("Multiprocessing run completed without error.") elif args.scheduler.lower() == "sge": logger.info("Running jobs with SGE") - logger.debug(f"Setting jobarray group size to {args.sgegroupsize}") - logger.debug(f"Joblist contains {len(joblist)}") + logger.debug("Setting jobarray group size to %s", args.sgegroupsize) + logger.debug("Joblist contains %s", len(joblist)) run_sge.run_dependency_graph( jobs, jgprefix=args.jobprefix, @@ -424,7 +424,7 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None: sgeargs=args.sgeargs, ) else: - logger.error(termcolor(f"Scheduler {args.scheduler} not recognised", "red")) + logger.error(termcolor("Scheduler %s not recognised", args.scheduler, "red")) raise SystemError(1) @@ -451,12 +451,10 @@ def update_comparison_results( # Add individual results to Comparison table for job in tqdm(joblist, disable=args.disable_tqdm): - logger.debug(f"\t{job.query.description} vs {job.subject.description}") + logger.debug("\t%s vs %s", job.query.description, job.subject.description) aln_length, sim_errs, ani_pid = aniblastall.parse_blast_tab( job.outfile, fraglens ) - logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}") - logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}") qcov = aln_length / job.query.length scov = aln_length / job.subject.length run.comparisons.append( From ffba9022163894c9120b4b15919e42c802c1e332 Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Tue, 7 Jun 2022 18:14:56 +0100 Subject: [PATCH 74/75] update deprecated behaviours in pandas - the sort_index() call now needs axis as an explicit keyword - pandas.utils.testing is now pandas.testing --- tests/test_aniblastall.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py index 5a8a0794..b04beb71 100644 --- a/tests/test_aniblastall.py +++ b/tests/test_aniblastall.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# (c) University of Strathclyde 2019-2021 +# (c) University of Strathclyde 2019-2022 # Author: Bailey Harrington # # Contact: @@ -16,7 +16,7 @@ # # The MIT License # -# Copyright (c) 2019-2021 University of Strathclyde +# Copyright (c) 2019-2022 University of Strathclyde # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -49,7 +49,7 @@ import pytest # noqa: F401 # pylint: disable=unused-import import unittest -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal from pyani import anib, pyani_files # probably don't need anib from pyani import aniblastall @@ -263,8 +263,8 @@ def test_parse_legacy_blastdir(aniblastall_output_dir): aniblastall_output_dir.legacyblastdir, orglengths, fraglengths ) assert_frame_equal( - result.percentage_identity.sort_index(1).sort_index(), - aniblastall_output_dir.legacyblastresult.sort_index(1).sort_index(), + result.percentage_identity.sort_index(axis=1).sort_index(), + aniblastall_output_dir.legacyblastresult.sort_index(axis=1).sort_index(), ) From bdefef028a338689d9c9595307ec622e0498bdbe Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Tue, 7 Jun 2022 18:31:08 +0100 Subject: [PATCH 75/75] remove unused import If an import is no longer required, it should be removed, rather than commented as "probably not required" or similar. --- tests/test_aniblastall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py index b04beb71..e534c9c4 100644 --- a/tests/test_aniblastall.py +++ b/tests/test_aniblastall.py @@ -51,7 +51,7 @@ from pandas.testing import assert_frame_equal -from pyani import anib, pyani_files # probably don't need anib +from pyani import pyani_files from pyani import aniblastall # Create object for accessing unittest assertions