From b64231c8a13944b8a183a5ad086d1718ea8040ee Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Thu, 15 Jul 2021 20:27:51 +0100
Subject: [PATCH 01/75] Initial split of v2 ANIb code into `anib.py` and
 `aniblastall.py` files

Including changing the parameter `mode` to `method`
---
 pyani/anib.py        | 171 ++++++---------------
 pyani/aniblastall.py | 355 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 398 insertions(+), 128 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 80b2947c..89848e49 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -224,7 +224,7 @@ def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict:
 
 
 def make_blastcmd_builder(
-    mode: str,
+    method: str,
     outdir: Path,
     format_exe: Optional[Path] = None,
     blast_exe: Optional[Path] = None,
@@ -232,32 +232,21 @@ def make_blastcmd_builder(
 ) -> BLASTcmds:
     """Return BLASTcmds object for construction of BLAST commands.
 
-    :param mode:  str, the kind of ANIb analysis (ANIb or ANIblastall)
+    :param method:  str, the kind of ANI analysis (ANIb)
     :param outdir:
     :param format_exe:
     :param blast_exe:
     :param prefix:
     """
-    if mode == "ANIb":  # BLAST/formatting executable depends on mode
-        blastcmds = BLASTcmds(
-            BLASTfunctions(construct_makeblastdb_cmd, construct_blastn_cmdline),
-            BLASTexes(
-                format_exe or pyani_config.MAKEBLASTDB_DEFAULT,
-                blast_exe or pyani_config.BLASTN_DEFAULT,
-            ),
-            prefix,
-            outdir,
-        )
-    else:
-        blastcmds = BLASTcmds(
-            BLASTfunctions(construct_formatdb_cmd, construct_blastall_cmdline),
-            BLASTexes(
-                format_exe or pyani_config.FORMATDB_DEFAULT,
-                blast_exe or pyani_config.BLASTALL_DEFAULT,
-            ),
-            prefix,
-            outdir,
-        )
+    blastcmds = BLASTcmds(
+        BLASTfunctions(construct_makeblastdb_cmd, construct_blastn_cmdline),
+        BLASTexes(
+            format_exe or pyani_config.MAKEBLASTDB_DEFAULT,
+            blast_exe or pyani_config.BLASTN_DEFAULT,
+        ),
+        prefix,
+        outdir,
+    )
     return blastcmds
 
 
@@ -271,9 +260,6 @@ def make_job_graph(
     :param fragfiles:  list of paths to fragmented input FASTA files
     :param blastcmds:
 
-    By default, will run ANIb - it *is* possible to make a mess of passing the
-    wrong executable for the mode you're using.
-
     All items in the returned graph list are BLAST executable jobs that must
     be run *after* the corresponding database creation. The Job objects
     corresponding to the database creation are contained as dependencies.
@@ -321,24 +307,19 @@ def generate_blastdb_commands(
     filenames: List[Path],
     outdir: Path,
     blastdb_exe: Optional[Path] = None,
-    mode: str = "ANIb",
 ) -> List[Tuple[str, Path]]:
-    """Return list of makeblastdb command-lines for ANIb/ANIblastall.
+    """Return list of makeblastdb command-lines for ANIb.
 
     :param filenames:  a list of paths to input FASTA files
     :param outdir:  path to output directory
     :param blastdb_exe:  path to the makeblastdb executable
-    :param mode:  str, ANIb analysis type (ANIb or ANIblastall)
+    :param method:  str, ANIb analysis type (ANIb)
     """
-    if mode == "ANIb":
-        construct_db_cmdline = construct_makeblastdb_cmd
-    else:
-        construct_db_cmdline = construct_formatdb_cmd
     if blastdb_exe is None:
-        cmdlines = [construct_db_cmdline(fname, outdir) for fname in filenames]
+        cmdlines = [construct_makeblastdb_cmd(fname, outdir) for fname in filenames]
     else:
         cmdlines = [
-            construct_db_cmdline(fname, outdir, blastdb_exe) for fname in filenames
+            construct_makeblastdb_cmd(fname, outdir, blastdb_exe) for fname in filenames
         ]
     return cmdlines
 
@@ -360,58 +341,38 @@ def construct_makeblastdb_cmd(
     )
 
 
-# Generate single makeblastdb command line
-def construct_formatdb_cmd(
-    filename: Path, outdir: Path, blastdb_exe: Path = pyani_config.FORMATDB_DEFAULT
-) -> Tuple[str, Path]:
-    """Return formatdb command and path to output file.
-
-    :param filename:  Path, input filename
-    :param outdir:  Path, path to output directory
-    :param blastdb_exe:  Path, path to the formatdb executable
-    """
-    newfilename = outdir / filename.name
-    shutil.copy(filename, newfilename)
-    return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename)
-
-
 # Generate list of BLASTN command lines from passed filenames
 def generate_blastn_commands(
     filenames: List[Path],
     outdir: Path,
     blast_exe: Optional[Path] = None,
-    mode: str = "ANIb",
 ) -> List[str]:
-    """Return a list of blastn command-lines for ANIm.
+    """Return a list of blastn command-lines for ANIb.
 
     :param filenames:  a list of paths to fragmented input FASTA files
     :param outdir:  path to output directory
     :param blastn_exe:  path to BLASTN executable
-    :param mode:  str, analysis type (ANIb or ANIblastall)
+    :param method:  str, analysis type (ANIb)
 
     Assumes that the fragment sequence input filenames have the form
     ACCESSION-fragments.ext, where the corresponding BLAST database filenames
     have the form ACCESSION.ext. This is the convention followed by the
     fragment_FASTA_files() function above.
     """
-    if mode == "ANIb":
-        construct_blast_cmdline = construct_blastn_cmdline  # type: Callable
-    else:
-        construct_blast_cmdline = construct_blastall_cmdline
     cmdlines = []
     for idx, fname1 in enumerate(filenames[:-1]):
         dbname1 = Path(str(fname1).replace("-fragments", ""))
         for fname2 in filenames[idx + 1 :]:
             dbname2 = Path(str(fname2).replace("-fragments", ""))
             if blast_exe is None:
-                cmdlines.append(construct_blast_cmdline(fname1, dbname2, outdir))
-                cmdlines.append(construct_blast_cmdline(fname2, dbname1, outdir))
+                cmdlines.append(construct_blastn_cmdline(fname1, dbname2, outdir))
+                cmdlines.append(construct_blastn_cmdline(fname2, dbname1, outdir))
             else:
                 cmdlines.append(
-                    construct_blast_cmdline(fname1, dbname2, outdir, blast_exe)
+                    construct_blastn_cmdline(fname1, dbname2, outdir, blast_exe)
                 )
                 cmdlines.append(
-                    construct_blast_cmdline(fname2, dbname1, outdir, blast_exe)
+                    construct_blastn_cmdline(fname2, dbname1, outdir, blast_exe)
                 )
     return cmdlines
 
@@ -440,33 +401,12 @@ def construct_blastn_cmdline(
     )
 
 
-# Generate single BLASTALL command line
-def construct_blastall_cmdline(
-    fname1: Path,
-    fname2: Path,
-    outdir: Path,
-    blastall_exe: Path = pyani_config.BLASTALL_DEFAULT,
-) -> str:
-    """Return single blastall command.
-
-    :param fname1:
-    :param fname2:
-    :param outdir:
-    :param blastall_exe:  str, path to BLASTALL executable
-    """
-    prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}"
-    return (
-        f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {fname1} -d {fname2} "
-        "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
-    )
-
-
 # Process pairwise BLASTN output
 def process_blast(
     blast_dir: Path,
     org_lengths: Dict,
     fraglengths: Dict,
-    mode: str = "ANIb",
+    method: str = "ANIb",
     logger: Optional[Logger] = None,
 ) -> ANIResults:
     """Return tuple of ANIb results for .blast_tab files in the output dir.
@@ -475,7 +415,7 @@ def process_blast(
     :param org_lengths:  Dict, the base count for each input sequence
     :param fraglengths:  dictionary of query sequence fragment lengths, only
         needed for BLASTALL output
-    :param mode:  str, analysis type (ANIb or ANIblastall)
+    :param method:  str, analysis type (ANIb)
     :param logger:  a logger for messages
 
     Returns the following pandas dataframes in an ANIResults object;
@@ -492,7 +432,7 @@ def process_blast(
     # Process directory to identify input files
     blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab")
     # Hold data in ANIResults object
-    results = ANIResults(list(org_lengths.keys()), mode)
+    results = ANIResults(list(org_lengths.keys()), method)
 
     # Fill diagonal NA values for alignment_length with org_lengths
     for org, length in list(org_lengths.items()):
@@ -521,7 +461,7 @@ def process_blast(
                     blastfile,
                 )
             continue
-        resultvals = parse_blast_tab(blastfile, fraglengths, mode)
+        resultvals = parse_blast_tab(blastfile, fraglengths, method)
         query_cover = float(resultvals[0]) / org_lengths[qname]
 
         # Populate dataframes: when assigning data, we need to note that
@@ -535,15 +475,13 @@ def process_blast(
 
 
 # Parse BLASTALL output to get total alignment length and mismatches
-def parse_blast_tab(
-    filename: Path, fraglengths: Dict, mode: str = "ANIb"
-) -> Tuple[int, int, int]:
+def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     """Return (alignment length, similarity errors, mean_pid) tuple.
 
     :param filename:  Path, path to .blast_tab file
     :param fraglengths:  Optional[Dict], dictionary of fragment lengths for each
         genome.
-    :param mode:  str, analysis type (ANIb or ANIblastall)
+    :param method:  str, analysis type (ANIb)
 
     Calculate the alignment length and total number of similarity errors (as
     we would with ANIm), as well as the Goris et al.-defined mean identity
@@ -556,41 +494,23 @@ def parse_blast_tab(
     over an alignable region of at least 70% of their length.
     '''
     """
-    # Assuming that the filename format holds org1_vs_org2.blast_tab:
-    qname = filename.stem.split("_vs_")[0]
     # Load output as dataframe
-    if mode == "ANIblastall":
-        qfraglengths = fraglengths[qname]
-        columns = [
-            "sid",
-            "blast_pid",
-            "blast_alnlen",
-            "blast_mismatch",
-            "blast_gaps",
-            "q_start",
-            "q_end",
-            "s_start",
-            "s_end",
-            "e_Value",
-            "bit_score",
-        ]
-    else:
-        columns = [
-            "sbjct_id",
-            "blast_alnlen",
-            "blast_mismatch",
-            "blast_pid",
-            "blast_identities",
-            "qlen",
-            "slen",
-            "q_start",
-            "q_end",
-            "s_start",
-            "s_end",
-            "blast_pos",
-            "ppos",
-            "blast_gaps",
-        ]
+    columns = [
+        "sbjct_id",
+        "blast_alnlen",
+        "blast_mismatch",
+        "blast_pid",
+        "blast_identities",
+        "qlen",
+        "slen",
+        "q_start",
+        "q_end",
+        "s_start",
+        "s_end",
+        "blast_pos",
+        "ppos",
+        "blast_gaps",
+    ]
     # We may receive an empty BLASTN output file, if there are no significant
     # regions of homology. This causes pandas to throw an error on CSV import.
     # To get past this, we create an empty dataframe with the appropriate
@@ -600,11 +520,6 @@ def parse_blast_tab(
         data.columns = columns
     except pd.io.common.EmptyDataError:
         data = pd.DataFrame(columns=columns)
-    # Add new column for fragment length, only for BLASTALL
-    if mode == "ANIblastall":
-        data["qlen"] = pd.Series(
-            [qfraglengths[idx] for idx in data.index], index=data.index
-        )
     # Add new columns for recalculated alignment length, proportion, and
     # percentage identity
     data["ani_alnlen"] = data["blast_alnlen"] - data["blast_gaps"]
diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index a6c0a345..964e3ee5 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -38,11 +38,18 @@
 import platform
 import re
 import subprocess
+import shutil
 
+import pandas as pd
 
 from pathlib import Path
+from typing import List, Tuple, Dict, Optional
+
+from Bio import SeqIO
 
 from . import pyani_config
+from . import pyani_jobs
+from .pyani_tools import BLASTcmds, BLASTfunctions, BLASTexes
 
 
 def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str:
@@ -72,3 +79,351 @@ def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str:
         r"(?<=blastall\s)[0-9\.]*", str(result.stderr, "utf-8")
     ).group()
     return f"{platform.system()}_{version}"
+
+
+# Divide input FASTA sequences into fragments
+def fragment_fasta_files(
+    infiles: List[Path], outdirname: Path, fragsize: int
+) -> Tuple[List, Dict]:
+    """Chop sequences of the passed files into fragments, return filenames.
+
+    :param infiles:  collection of paths to each input sequence file
+    :param outdirname:  Path, path to output directory
+    :param fragsize:  Int, the size of sequence fragments
+
+    Takes every sequence from every file in infiles, and splits them into
+    consecutive fragments of length fragsize, (with any trailing sequences
+    being included, even if shorter than fragsize), writing the resulting
+    set of sequences to a file with the same name in the specified
+    output directory.
+
+    All fragments are named consecutively and uniquely (within a file) as
+    fragNNNNN. Sequence description fields are retained.
+
+    Returns a tuple ``(filenames, fragment_lengths)`` where ``filenames`` is a
+    list of paths to the fragment sequence files, and ``fragment_lengths`` is
+    a dictionary of sequence fragment lengths, keyed by the sequence files,
+    with values being a dictionary of fragment lengths, keyed by fragment
+    IDs.
+    """
+    outfnames = []
+    for fname in infiles:
+        outfname = outdirname / f"{fname.stem}-fragments{fname.suffix}"
+        outseqs = []
+        count = 0
+        for seq in SeqIO.parse(fname, "fasta"):
+            idx = 0
+            while idx < len(seq):
+                count += 1
+                newseq = seq[idx : idx + fragsize]
+                newseq.id = "frag%05d" % count
+                outseqs.append(newseq)
+                idx += fragsize
+        outfnames.append(outfname)
+        SeqIO.write(outseqs, outfname, "fasta")
+    return outfnames, get_fraglength_dict(outfnames)
+
+
+# Get lengths of all sequences in all files
+def get_fraglength_dict(fastafiles: List[Path]) -> Dict:
+    """Return dictionary of sequence fragment lengths, keyed by query name.
+
+    :param fastafiles:  list of paths to FASTA input whole sequence files
+
+    Loops over input files and, for each, produces a dictionary with fragment
+    lengths, keyed by sequence ID. These are returned as a dictionary with
+    the keys being query IDs derived from filenames.
+    """
+    fraglength_dict = {}
+    for filename in fastafiles:
+        qname = filename.stem.split("-fragments")[0]
+        fraglength_dict[qname] = get_fragment_lengths(filename)
+    return fraglength_dict
+
+
+# Get lengths of all sequences in a file
+def get_fragment_lengths(fastafile: Path) -> Dict:
+    """Return dictionary of sequence fragment lengths, keyed by fragment ID.
+
+    :param fastafile:
+
+    Biopython's SeqIO module is used to parse all sequences in the FASTA
+    file.
+
+    NOTE: ambiguity symbols are not discounted.
+    """
+    fraglengths = {}
+    for seq in SeqIO.parse(fastafile, "fasta"):
+        fraglengths[seq.id] = len(seq)
+    return fraglengths
+
+
+# Create dictionary of database building commands, keyed by dbname
+def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict:
+    """Return dictionary of db-building commands, keyed by dbname.
+
+    :param infiles:
+    :param blastcmds:
+    """
+    dbjobdict = {}  # Dict of database construction jobs, keyed by filename
+    # Create dictionary of database building jobs, keyed by db name
+    # defining jobnum for later use as last job index used
+    for idx, fname in enumerate(infiles):
+        dbjobdict[blastcmds.get_db_name(fname)] = pyani_jobs.Job(
+            f"{blastcmds.prefix}_db_{idx:06}", blastcmds.build_db_cmd(fname)
+        )
+    return dbjobdict
+
+
+def make_blastcmd_builder(
+    method: str,
+    outdir: Path,
+    format_exe: Optional[Path] = None,
+    blast_exe: Optional[Path] = None,
+    prefix: str = "ANIBLAST",
+) -> BLASTcmds:
+    """Return BLASTcmds object for construction of BLAST commands.
+
+    :param method:  str, the kind of ANI analysis (ANIblastall)
+    :param outdir:
+    :param format_exe:
+    :param blast_exe:
+    :param prefix:
+    """
+    blastcmds = BLASTcmds(
+        BLASTfunctions(construct_formatdb_cmd, construct_blastall_cmdline),
+        BLASTexes(
+            format_exe or pyani_config.FORMATDB_DEFAULT,
+            blast_exe or pyani_config.BLASTALL_DEFAULT,
+        ),
+        prefix,
+        outdir,
+    )
+    return blastcmds
+
+
+# Make a dependency graph of BLAST commands
+def make_job_graph(
+    infiles: List[Path], fragfiles: List[Path], blastcmds: BLASTcmds
+) -> List[pyani_jobs.Job]:
+    """Return job dependency graph, based on the passed input sequence files.
+
+    :param infiles:  list of paths to input FASTA files
+    :param fragfiles:  list of paths to fragmented input FASTA files
+    :param blastcmds:
+
+    All items in the returned graph list are BLAST executable jobs that must
+    be run *after* the corresponding database creation. The Job objects
+    corresponding to the database creation are contained as dependencies.
+    How those jobs are scheduled depends on the scheduler (see
+    run_multiprocessing.py, run_sge.py)
+    """
+    joblist = []  # Holds list of job dependency graphs
+
+    # Get dictionary of database-building jobs
+    dbjobdict = build_db_jobs(infiles, blastcmds)
+
+    # Create list of BLAST executable jobs, with dependencies
+    jobnum = len(dbjobdict)
+    for idx, fname1 in enumerate(fragfiles[:-1]):
+        for fname2 in fragfiles[idx + 1 :]:
+            jobnum += 1
+            jobs = [
+                pyani_jobs.Job(
+                    f"{blastcmds.prefix}_exe_{jobnum:06d}_a",
+                    blastcmds.build_blast_cmd(
+                        fname1, fname2.parent / fname2.name.replace("-fragments", "")
+                    ),
+                ),
+                pyani_jobs.Job(
+                    f"{blastcmds.prefix}_exe_{jobnum:06d}_b",
+                    blastcmds.build_blast_cmd(
+                        fname2, fname1.parent / fname1.name.replace("-fragments", "")
+                    ),
+                ),
+            ]
+            jobs[0].add_dependency(
+                dbjobdict[fname1.parent / fname1.name.replace("-fragments", "")]
+            )
+            jobs[1].add_dependency(
+                dbjobdict[fname2.parent / fname2.name.replace("-fragments", "")]
+            )
+            joblist.extend(jobs)
+
+    # Return the dependency graph
+    return joblist
+
+
+# Generate list of makeblastdb command lines from passed filenames
+def generate_blastdb_commands(
+    filenames: List[Path],
+    outdir: Path,
+    blastdb_exe: Optional[Path] = None,
+) -> List[Tuple[str, Path]]:
+    """Return list of makeblastdb command-lines for ANIblastall.
+
+    :param filenames:  a list of paths to input FASTA files
+    :param outdir:  path to output directory
+    :param blastdb_exe:  path to the makeblastdb executable
+    :param method:  str, ANI analysis type (ANIblastall)
+    """
+    if blastdb_exe is None:
+        cmdlines = [construct_formatdb_cmd(fname, outdir) for fname in filenames]
+    else:
+        cmdlines = [
+            construct_formatdb_cmd(fname, outdir, blastdb_exe) for fname in filenames
+        ]
+    return cmdlines
+
+
+# Generate single makeblastdb command line
+def construct_formatdb_cmd(
+    filename: Path, outdir: Path, blastdb_exe: Path = pyani_config.FORMATDB_DEFAULT
+) -> Tuple[str, Path]:
+    """Return formatdb command and path to output file.
+
+    :param filename:  Path, input filename
+    :param outdir:  Path, path to output directory
+    :param blastdb_exe:  Path, path to the formatdb executable
+    """
+    newfilename = outdir / filename.name
+    shutil.copy(filename, newfilename)
+    return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename)
+
+
+# Generate list of BLASTN command lines from passed filenames
+def generate_blastn_commands(
+    filenames: List[Path],
+    outdir: Path,
+    blast_exe: Optional[Path] = None,
+) -> List[str]:
+    """Return a list of blastn command-lines for ANIblastall.
+
+    :param filenames:  a list of paths to fragmented input FASTA files
+    :param outdir:  path to output directory
+    :param blastn_exe:  path to BLASTN executable
+    :param method:  str, analysis type (ANIblastall)
+
+    Assumes that the fragment sequence input filenames have the form
+    ACCESSION-fragments.ext, where the corresponding BLAST database filenames
+    have the form ACCESSION.ext. This is the convention followed by the
+    fragment_FASTA_files() function above.
+    """
+    cmdlines = []
+    for idx, fname1 in enumerate(filenames[:-1]):
+        dbname1 = Path(str(fname1).replace("-fragments", ""))
+        for fname2 in filenames[idx + 1 :]:
+            dbname2 = Path(str(fname2).replace("-fragments", ""))
+            if blast_exe is None:
+                cmdlines.append(construct_blastall_cmdline(fname1, dbname2, outdir))
+                cmdlines.append(construct_blastall_cmdline(fname2, dbname1, outdir))
+            else:
+                cmdlines.append(
+                    construct_blastall_cmdline(fname1, dbname2, outdir, blast_exe)
+                )
+                cmdlines.append(
+                    construct_blastall_cmdline(fname2, dbname1, outdir, blast_exe)
+                )
+    return cmdlines
+
+
+# Generate single BLASTALL command line
+def construct_blastall_cmdline(
+    fname1: Path,
+    fname2: Path,
+    outdir: Path,
+    blastall_exe: Path = pyani_config.BLASTALL_DEFAULT,
+) -> str:
+    """Return single blastall command.
+
+    :param fname1:
+    :param fname2:
+    :param outdir:
+    :param blastall_exe:  str, path to BLASTALL executable
+    """
+    prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}"
+    return (
+        f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {fname1} -d {fname2} "
+        "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
+    )
+
+
+# Parse BLASTALL output to get total alignment length and mismatches
+def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
+    """Return (alignment length, similarity errors, mean_pid) tuple.
+
+    :param filename:  Path, path to .blast_tab file
+    :param fraglengths:  Optional[Dict], dictionary of fragment lengths for each
+        genome.
+    :param method:  str, analysis type (ANIblastall)
+
+    Calculate the alignment length and total number of similarity errors (as
+    we would with ANIm), as well as the Goris et al.-defined mean identity
+    of all valid BLAST matches for the passed BLASTALL alignment .blast_tab
+    file.
+
+    '''ANI between the query genome and the reference genome was calculated as
+    the mean identity of all BLASTN matches that showed more than 30% overall
+    sequence identity (recalculated to an identity along the entire sequence)
+    over an alignable region of at least 70% of their length.
+    '''
+    """
+    # Assuming that the filename format holds org1_vs_org2.blast_tab:
+    qname = filename.stem.split("_vs_")[0]
+    # Load output as dataframe
+    qfraglengths = fraglengths[qname]
+    columns = [
+        "sid",
+        "blast_pid",
+        "blast_alnlen",
+        "blast_mismatch",
+        "blast_gaps",
+        "q_start",
+        "q_end",
+        "s_start",
+        "s_end",
+        "e_Value",
+        "bit_score",
+    ]
+    # We may receive an empty BLASTN output file, if there are no significant
+    # regions of homology. This causes pandas to throw an error on CSV import.
+    # To get past this, we create an empty dataframe with the appropriate
+    # columns.
+    try:
+        data = pd.read_csv(filename, header=None, sep="\t", index_col=0)
+        data.columns = columns
+    except pd.io.common.EmptyDataError:
+        data = pd.DataFrame(columns=columns)
+    # Add new column for fragment length, only for BLASTALL
+    data["qlen"] = pd.Series(
+        [qfraglengths[idx] for idx in data.index], index=data.index
+    )
+    # Add new columns for recalculated alignment length, proportion, and
+    # percentage identity
+    data["ani_alnlen"] = data["blast_alnlen"] - data["blast_gaps"]
+    data["ani_alnids"] = data["ani_alnlen"] - data["blast_mismatch"]
+    data["ani_coverage"] = data["ani_alnlen"] / data["qlen"]
+    data["ani_pid"] = data["ani_alnids"] / data["qlen"]
+    # Filter rows on 'ani_coverage' > 0.7, 'ani_pid' > 0.3
+    filtered = data[(data["ani_coverage"] > 0.7) & (data["ani_pid"] > 0.3)]
+    # Dedupe query hits, so we only take the best hit
+    filtered = filtered.groupby(filtered.index).first()
+    # Replace NaNs with zero
+    filtered = filtered.fillna(value=0)  # Needed if no matches
+    # The ANI value is then the mean percentage identity.
+    # We report total alignment length and the number of similarity errors
+    # (mismatches and gaps), as for ANIm
+    # NOTE: We report the mean of 'blast_pid' for concordance with JSpecies
+    # Despite this, the concordance is not exact. Manual inspection during
+    # development indicated that a handful of fragments are differentially
+    # filtered out in JSpecies and this script. This is often on the basis
+    # of rounding differences (e.g. coverage being close to 70%).
+    # NOTE: If there are no hits, then ani_pid will be nan - we replace this
+    # with zero if that happens
+    ani_pid = filtered["blast_pid"].mean()
+    if pd.isnull(ani_pid):  # Happens if there are no matches in ANIb
+        ani_pid = 0
+    aln_length = filtered["ani_alnlen"].sum()
+    sim_errors = filtered["blast_mismatch"].sum() + filtered["blast_gaps"].sum()
+    filtered.to_csv(Path(filename).with_suffix(".blast_tab.dataframe"), sep="\t")
+    return aln_length, sim_errors, ani_pid

From 39bdde19db3be4403348c8ab4fee51114e16ae6c Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 21 Jul 2021 19:54:58 +0100
Subject: [PATCH 02/75] Updated comments/logging

---
 pyani/scripts/subcommands/subcmd_anib.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index d16cdeb7..d46547c5 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -89,11 +89,11 @@ def subcmd_anib(args: Namespace) -> None:
 
     The calculated values are stored in the local SQLite3 database.
     """
+    # Create logger
     logger = logging.getLogger(__name__)
 
-    logger.info(
-        termcolor("Running ANIm analysis", "red")
-    )  # announce that we're starting
+    # Announce the analysis
+    logger.info(termcolor("Running ANIb analysis", "red"))
 
     # Get BLAST+ version - this will be used in the database entries
     blastn_version = anib.get_version(args.blastn_exe)
@@ -102,7 +102,7 @@ def subcmd_anib(args: Namespace) -> None:
     # Use provided name, or make new one for this analysis
     start_time = datetime.datetime.now()
     name = args.name or "_".join(["ANIb", start_time.isoformat()])
-    logger.info("Analysis name: %s", name)
+    logger.info(termcolor("Analysis name: %s", "cyan"), name)
 
     # Connect to existing database (which may be "clean" or have old analyses)
     logger.debug("Connecting to database %s", args.dbpath)
@@ -225,7 +225,7 @@ def subcmd_anib(args: Namespace) -> None:
                 existingfiles[0],
             )
         else:
-            logger.debug(f"\tIdentified no existing output files")
+            logger.debug("\tIdentified no existing output files")
     else:
         existingfiles = list()
         logger.debug("\tAssuming no pre-existing output files")
@@ -248,7 +248,7 @@ def subcmd_anib(args: Namespace) -> None:
     joblist = generate_joblist(
         comparisons_to_run, existingfiles, fragfiles, fraglens, args
     )
-    logger.debug(f"...created %s blastn jobs", len(joblist))
+    logger.debug("...created %s blastn jobs", len(joblist))
 
     raise NotImplementedError
 

From 7e9ebe250922402ae913b850585d1ba61ad7afdd Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 21 Jul 2021 19:55:50 +0100
Subject: [PATCH 03/75] Added `sysexit` when adding genomes to the database
 fails

---
 pyani/scripts/subcommands/subcmd_anib.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index d46547c5..e412eab4 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -140,6 +140,7 @@ def subcmd_anib(args: Namespace) -> None:
         logger.error(
             "Could not add genomes to database for run %s (exiting)", run, exc_info=True
         )
+        raise SystemExit(1)
     logger.debug("\t...added genome IDs: %s", genome_ids)
 
     # Get list of genomes for this analysis from the database

From 70abeb91322379ad3888590b175da103d058b01d Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 21 Jul 2021 20:20:06 +0100
Subject: [PATCH 04/75] Added initial versions of functions and comments about
 needed code

`run_anib_jobs()`, `update_comparison_results()`
---
 pyani/scripts/subcommands/subcmd_anib.py | 110 ++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 2 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index e412eab4..5394558e 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -47,7 +47,7 @@
 from argparse import Namespace
 from itertools import permutations
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, NamedTuple, Tuple
 
 from Bio import SeqIO
 from tqdm import tqdm
@@ -55,16 +55,33 @@
 from pyani import anib
 from pyani.pyani_files import collect_existing_output
 from pyani.pyani_orm import (
-    PyaniORMException,
     add_run,
     add_run_genomes,
+    add_blastdb,
+    Comparison,
     filter_existing_comparisons,
     get_session,
+    pyani_jobs,
+    PyaniORMException,
     update_comparison_matrices,
 )
 from pyani.pyani_tools import termcolor
 
 
+# Convenience struct describing a pairwise comparison job for the SQLAlchemy
+# implementation
+class ComparisonJob(NamedTuple):
+
+    """Pairwise comparison job for the SQLAlchemy implementation"""
+
+    query: str
+    subject: str
+    blastcmd: str
+    outfile: Path
+    fragsize: int
+    job: pyani_jobs.Job
+
+
 def subcmd_anib(args: Namespace) -> None:
     """Perform ANIb on all genome files in an input directory.
 
@@ -164,6 +181,12 @@ def subcmd_anib(args: Namespace) -> None:
     os.makedirs(fragdir, exist_ok=True)
     os.makedirs(blastdbdir, exist_ok=True)
 
+    # jobgraph = anib.make_job_graph(
+    #    infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir)
+    # )
+
+    # Create dictionary of database building commands, keyed by dbname
+
     # Create a new sequence fragment file and a new BLAST+ database for each input genome,
     # and add this data to the database as a row in BlastDB
     logger.info("Creating input sequence fragment files")
@@ -304,3 +327,86 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path
     fragpath = outdir / f"{inpath.stem}-fragments.fasta"
     SeqIO.write(outseqs, fragpath, "fasta")
     return fragpath, json.dumps(sizedict)
+
+
+# def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
+#    """Pass ANIb blastn jobs to the scheduler.
+#
+#    :param joblist:           list of ComparisonJob namedtuples
+#    :param args:              command-line arguments for the run
+#    """
+#    logger = logging.getLogger(__name__)
+#    logger.debug("Scheduler: %s", args.scheduler)
+#
+#    # Entries with None seen in recovery mode:
+#    jobs = [_.job for _ in joblist if _.job]
+#
+#    if args.scheduler == "multiprocessing":
+#        logger.info("Running jobs with multiprocessing")
+#        if not args.workers:
+#            logger.debug("(using maximum number of worker threads)")
+#        else:
+#            logger.debug("(using %d worker threads, if available)", args.workers)
+#        cumval = run_mp.run_dependency_graph(jobs, workers=args.workers)
+#        if cumval > 0:
+#            logger.error(
+#                "At least one blastn comparison failed. Please investigate (exiting)"
+#            )
+#            raise PyaniException("Multiprocessing run failed in ANIb")
+#        logger.info("Multiprocessing run completed without error")
+#    elif args.scheduler.lower() == "sge":
+#        logger.info("Running jobs with SGE")
+#        logger.debug("Setting jobarray group size to %d", args.sgegroupsize)
+#        logger.debug("Joblist contains %d jobs", len(joblist))
+#        run_sge.run_dependency_graph(
+#            jobs,
+#            jgprefix=args.jobprefix,
+#            sgegroupsize=args.sgegroupsize,
+#            sgeargs=args.sgeargs,
+#        )
+#    else:
+#        logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler)
+#        raise SystemError(1)
+#
+#
+# def update_comparison_results(
+#    joblist: List[ComparisonJob], run, session, blastn_version: str, args: Namespace
+# ) -> None:
+#    """Update the Comparision table with the completed result set.
+#
+#    :param joblist:         list of ComparisonJob namedtuples
+#    :param run:             Run ORM object for the current ANIb run
+#    :param session:         active pyanidb session via ORM
+#    :param blastn_version:  version of blastn used for the comparison
+#    :param args:            command-line arguments for this run
+#
+#    The Comparison table stores individual comparison results, one per row.
+#    """
+#    logger = logging.getLogger(__name__)
+#
+#    # Add individual results to Comparison table
+#    for job in tqdm(joblist, disable=args.disable_tqdm):
+#        logger.debug("\t%s vs %s", job.query.description, job.subject.description)
+#        aln_length, sim_errs, ani_pid = anib.process_blast_tab(job.outfile)
+#        qcov = aln_length / job.query.length
+#        scov = aln_length / job.subject.length
+#        run.comparisons.append(
+#            Comparison(
+#                query=job.query,
+#                subject=job.subject,
+#                aln_length=aln_length,
+#                sim_errs=sim_errs,
+#                identity=ani_pid,
+#                cov_query=qcov,
+#                cov_subject=scov,
+#                program="blastn",
+#                version=blastn_version,
+#                fragsize=job.fragsize,
+#                maxmatch=False,
+#            )
+#        )
+#
+#    # Populate db
+#    logger.debug("Committing results to database")
+#    session.commit()
+#

From 9dcade60d062a24ef3aae2fb9e2bcda199501e7a Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 21 Jul 2021 20:20:41 +0100
Subject: [PATCH 05/75] Rename `fraglens` to `fragsizes` for consistency with
 `anib.py`

---
 pyani/scripts/subcommands/subcmd_anib.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 5394558e..6555b96e 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -191,13 +191,12 @@ def subcmd_anib(args: Namespace) -> None:
     # and add this data to the database as a row in BlastDB
     logger.info("Creating input sequence fragment files")
     for genome in genomes:
-        fragpath, fraglengths = fragment_fasta_file(
+        fragpath, fragsizes = fragment_fasta_file(
             Path(str(genome.path)), Path(str(fragdir)), args.fragsize
         )
-        print(fragpath, len(fraglengths))
-        # blastdb = add_blastdb(
-        #     session, genome, run, fragpath, dbpath, fraglengths, dbcmd
-        # )
+        print(fragpath, len(fragsizes))
+
+        # blastdb = add_blastdb(session, genome, run, fragpath, dbpath, fragsizes, dbcmd)
 
     raise NotImplementedError
 
@@ -281,7 +280,7 @@ def generate_joblist(
     comparisons: List,
     existingfiles: List,
     fragfiles: List,
-    fraglens: List,
+    fragsizes: List,
     args: Namespace,
 ) -> NotImplementedError:
     """Return list of ComparisonJobs.
@@ -289,7 +288,7 @@ def generate_joblist(
     :param comparisons:  list of (Genome, Genome) tuples for which comparisons are needed
     :param existingfiles:  list of pre-existing BLASTN+ outputs
     :param fragfiles:
-    :param fraglens:
+    :param fragsizes:
     :param args:  Namespace, command-line arguments
     """
     # logger = logging.getLogger(__name__)

From c660e63c61f9757670f2e6a2cd754bacec15d0e9 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Mon, 13 Sep 2021 23:53:11 +0100
Subject: [PATCH 06/75] Add `add_blastdb()` to `pyani_orm.py`

---
 pyani/pyani_orm.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py
index bb4aeb9e..e2b16c26 100644
--- a/pyani/pyani_orm.py
+++ b/pyani/pyani_orm.py
@@ -312,6 +312,39 @@ def __repr__(self) -> str:
         return "<Comparison(comparison_id={})>".format(self.comparison_id)
 
 
+def add_blastdb(session, genome, run, fragpath, dbpath, fragsizes, dbcmd) -> None:
+    """Update the Blastdb table with a fragment dictionary for genome.
+
+    :param session:         active pyanidb session via ORM
+    :param genome:          Genome ORM object
+    :param run:             Run ORM object for the current ANIb run
+    :param fragpath:        Path to the fragment sequence file for Genome
+    :param dbpath:          Path to the database file
+    :param fragsizes:       fragment size dictionary for Genome
+    :param dbcmd:           command used to generate database
+
+    """
+    try:
+        blastdb = BlastDB(
+            genome_id=genome.genome_id,
+            run_id=run.run_id,
+            fragpath=str(fragpath),
+            dbpath=str(dbpath),
+            fragsizes=fragsizes,
+            dbcmd=dbcmd,
+        )
+    except Exception:
+        raise PyaniORMException(
+            f"Could not create {genome} blast database with command line: {dbcmd}"
+        )
+    try:
+        session.add(blastdb)
+        session.commit()
+    except Exception:
+        raise PyaniORMException(f"Could not add blastdb for {genome} to the database")
+    return blastdb
+
+
 def create_db(dbpath: Path) -> None:
     """Create an empty pyani SQLite3 database at the passed path.
 

From bf4fa20f49a4e1070d4ce2b89ab2f7bca9ca58e3 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Mon, 13 Sep 2021 23:54:10 +0100
Subject: [PATCH 07/75] Change default for `maxmatch` from `None` to `False`

If this is not a boolean, it prevents sqlite from recognising duplicate
entries
---
 pyani/pyani_orm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py
index e2b16c26..c2378fe4 100644
--- a/pyani/pyani_orm.py
+++ b/pyani/pyani_orm.py
@@ -433,7 +433,7 @@ def filter_existing_comparisons(
     program,
     version,
     fragsize: Optional[int] = None,
-    maxmatch: Optional[bool] = None,
+    maxmatch: Optional[bool] = False,
 ) -> List:
     """Filter list of (Genome, Genome) comparisons for those not in the session db.
 

From c1b4be117e9ff0d2b211e2ed448ad40bfc4dc0dc Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:00:19 +0100
Subject: [PATCH 08/75] Add/expand code to process input genomes

Up to adding them to the database
---
 pyani/scripts/subcommands/subcmd_anib.py | 40 ++++++++++++++++--------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 6555b96e..97d9c905 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -43,16 +43,24 @@
 import json
 import logging
 import os
+import subprocess
 
 from argparse import Namespace
 from itertools import permutations
 from pathlib import Path
-from typing import List, NamedTuple, Tuple
+from typing import List, NamedTuple, Tuple, Dict
 
 from Bio import SeqIO
 from tqdm import tqdm
 
-from pyani import anib
+from pyani import (
+    PyaniException,
+    anib,
+    pyani_config,
+    pyani_jobs,
+    run_sge,
+    run_multiprocessing as run_mp,
+)
 from pyani.pyani_files import collect_existing_output
 from pyani.pyani_orm import (
     add_run,
@@ -61,7 +69,6 @@
     Comparison,
     filter_existing_comparisons,
     get_session,
-    pyani_jobs,
     PyaniORMException,
     update_comparison_matrices,
 )
@@ -181,24 +188,31 @@ def subcmd_anib(args: Namespace) -> None:
     os.makedirs(fragdir, exist_ok=True)
     os.makedirs(blastdbdir, exist_ok=True)
 
-    # jobgraph = anib.make_job_graph(
-    #    infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir)
-    # )
-
-    # Create dictionary of database building commands, keyed by dbname
-
     # Create a new sequence fragment file and a new BLAST+ database for each input genome,
     # and add this data to the database as a row in BlastDB
     logger.info("Creating input sequence fragment files")
+    fragfiles = {}
+    fraglens = {}
     for genome in genomes:
         fragpath, fragsizes = fragment_fasta_file(
             Path(str(genome.path)), Path(str(fragdir)), args.fragsize
         )
-        print(fragpath, len(fragsizes))
-
-        # blastdb = add_blastdb(session, genome, run, fragpath, dbpath, fragsizes, dbcmd)
+        fragfiles.update({genome: fragpath})
+        fraglens.update({genome: fragsizes})
+
+        dbcmd, blastdbpath = anib.construct_makeblastdb_cmd(
+            fragpath, blastdbdir
+        )  # args.outdir)
+
+        subprocess.run(
+            dbcmd,
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+        )
 
-    raise NotImplementedError
+        add_blastdb(session, genome, run, fragpath, blastdbpath, fragsizes, dbcmd)
 
     # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples
     logger.info(

From 34f9ec31579a7ac1929844498e3970b82e36dc31 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:03:42 +0100
Subject: [PATCH 09/75] Split genome files into contiguous fragments

---
 pyani/scripts/subcommands/subcmd_anib.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 97d9c905..66e7cc32 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -267,18 +267,18 @@ def subcmd_anib(args: Namespace) -> None:
         existingfiles = list()
         logger.debug("\tAssuming no pre-existing output files")
 
-    # Split the input genome files into contiguous fragments of the specified size,
-    # as described in Goris et al. We create a new directory to hold sequence
-    # fragments, away from the main genomes
-    logger.info("Splitting input genome files into %snt fragments...", args.fragsize)
-    fragdir = Path(args.outdir) / "fragments"
-    os.makedirs(fragdir, exist_ok=True)
-    fragfiles, fraglens = anib.fragment_fasta_files(
-        [Path(str(_.path)) for _ in genomes],
-        Path(args.outdir) / "fragments",
-        args.fragsize,
-    )
-    logger.debug("...wrote %s fragment files to %s", len(fragfiles), fragdir)
+    ## Split the input genome files into contiguous fragments of the specified size,
+    ## as described in Goris et al. We create a new directory to hold sequence
+    ## fragments, away from the main genomes
+    # logger.info("Splitting input genome files into %snt fragments...", args.fragsize)
+    # fragdir = Path(args.outdir) / "fragments"
+    # os.makedirs(fragdir, exist_ok=True)
+    # fragfiles, fraglens = anib.fragment_fasta_files(
+    #     [Path(str(_.path)) for _ in genomes],
+    #     Path(args.outdir) / "fragments",
+    #     args.fragsize,
+    # )
+    # logger.debug("...wrote %s fragment files to %s", len(fragfiles), fragdir)
 
     # Create list of BLASTN jobs for each comparison still to be performed
     logger.info("Creating blastn jobs for ANIb...")

From 5cdd45298df4ff28c3bc04b27eb1f1e4237ee9cf Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:06:03 +0100
Subject: [PATCH 10/75] Implement `generate_joblist()`

---
 pyani/scripts/subcommands/subcmd_anib.py | 50 +++++++++++++++++++++---
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 66e7cc32..d7b3f63c 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -296,17 +296,57 @@ def generate_joblist(
     fragfiles: List,
     fragsizes: List,
     args: Namespace,
-) -> NotImplementedError:
+) -> List[ComparisonJob]:
     """Return list of ComparisonJobs.
 
     :param comparisons:  list of (Genome, Genome) tuples for which comparisons are needed
     :param existingfiles:  list of pre-existing BLASTN+ outputs
-    :param fragfiles:
-    :param fragsizes:
+    :param fragfiles:  list of files containing genome fragments
+    :param fragsizes:  list of fragment lengths
     :param args:  Namespace, command-line arguments
     """
-    # logger = logging.getLogger(__name__)
-    raise NotImplementedError
+    logger = logging.getLogger(__name__)
+
+    existingfiles = set(existingfiles)  # Path objects hashable
+
+    joblist = []  # will hold ComparisonJob structs
+    for idx, (query, subject) in enumerate(
+        tqdm(comparisons, disable=args.disable_tqdm)
+    ):
+        # fname1 = Path(query.path)
+        # fname2 = Path(subject.path)
+        blastcmd = anib.generate_blastn_commands(
+            Path(query.path), Path(subject.path), args.outdir, args.blastn_exe
+        )
+        logger.debug("Commands to run:\n\t%s\n", blastcmd)
+        outprefix = blastcmd.split()[2][:-10]  # prefix for blastn output
+        outfname = Path(outprefix + ".blast_tab")
+        logger.debug("Expected output file for db: %s", outfname)
+
+        # If e're in recovery mode, we don't want to repeat a computational
+        # comparison that already exists, so we check whether the ultimate
+        # output is in the set of existing files and, if not, we add the obs
+
+        # The comparisons collection always gets updated, so that results are
+        # added to the database whether they come from recovery mode or are run
+        # in this call of the script.
+        if args.recovery and outfname in existingfiles:
+            logger.debug("Recovering output from %s, not submitting job", outfname)
+            # Need to track the expected output, but set the job itself to None:
+            joblist.append(
+                ComparisonJob(query, subject, blastcmd, outfname, args.fragsize, None)
+            )
+        else:
+            logger.debug("Building job")
+            # Build job
+            blastjob = pyani_jobs.Job("%s_%06d-blast" % (args.jobprefix, idx), blastcmd)
+            joblist.append(
+                ComparisonJob(
+                    query, subject, blastcmd, outfname, args.fragsize, blastjob
+                )
+            )
+    return joblist
+    # raise NotImplementedError
 
 
 def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:

From d78b4799fa458da3ee8e3b4ba6ae528a0ba2777d Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:08:08 +0100
Subject: [PATCH 11/75] Implement `run_anib_jobs()`

---
 pyani/scripts/subcommands/subcmd_anib.py | 78 ++++++++++++------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index d7b3f63c..448555f8 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -382,44 +382,46 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path
     return fragpath, json.dumps(sizedict)
 
 
-# def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
-#    """Pass ANIb blastn jobs to the scheduler.
-#
-#    :param joblist:           list of ComparisonJob namedtuples
-#    :param args:              command-line arguments for the run
-#    """
-#    logger = logging.getLogger(__name__)
-#    logger.debug("Scheduler: %s", args.scheduler)
-#
-#    # Entries with None seen in recovery mode:
-#    jobs = [_.job for _ in joblist if _.job]
-#
-#    if args.scheduler == "multiprocessing":
-#        logger.info("Running jobs with multiprocessing")
-#        if not args.workers:
-#            logger.debug("(using maximum number of worker threads)")
-#        else:
-#            logger.debug("(using %d worker threads, if available)", args.workers)
-#        cumval = run_mp.run_dependency_graph(jobs, workers=args.workers)
-#        if cumval > 0:
-#            logger.error(
-#                "At least one blastn comparison failed. Please investigate (exiting)"
-#            )
-#            raise PyaniException("Multiprocessing run failed in ANIb")
-#        logger.info("Multiprocessing run completed without error")
-#    elif args.scheduler.lower() == "sge":
-#        logger.info("Running jobs with SGE")
-#        logger.debug("Setting jobarray group size to %d", args.sgegroupsize)
-#        logger.debug("Joblist contains %d jobs", len(joblist))
-#        run_sge.run_dependency_graph(
-#            jobs,
-#            jgprefix=args.jobprefix,
-#            sgegroupsize=args.sgegroupsize,
-#            sgeargs=args.sgeargs,
-#        )
-#    else:
-#        logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler)
-#        raise SystemError(1)
+def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
+    """Pass ANIb blastn jobs to the scheduler.
+
+    :param joblist:           list of ComparisonJob namedtuples
+    :param args:              command-line arguments for the run
+    """
+    logger = logging.getLogger(__name__)
+    logger.debug("Scheduler: %s", args.scheduler)
+
+    # Entries with None seen in recovery mode:
+    jobs = [_.job for _ in joblist if _.job]
+
+    if args.scheduler == "multiprocessing":
+        logger.info("Running jobs with multiprocessing")
+        if not args.workers:
+            logger.debug("(using maximum number of worker threads)")
+        else:
+            logger.debug("(using %d worker threads, if available)", args.workers)
+        cumval = run_mp.run_dependency_graph(jobs, workers=args.workers)
+        if cumval > 0:
+            logger.error(
+                "At least one blastn comparison failed. Please investigate (exiting)"
+            )
+            raise PyaniException("Multiprocessing run failed in ANIb")
+        logger.info("Multiprocessing run completed without error")
+    elif args.scheduler.lower() == "sge":
+        logger.info("Running jobs with SGE")
+        logger.debug("Setting jobarray group size to %d", args.sgegroupsize)
+        logger.debug("Joblist contains %d jobs", len(joblist))
+        run_sge.run_dependency_graph(
+            jobs,
+            jgprefix=args.jobprefix,
+            sgegroupsize=args.sgegroupsize,
+            sgeargs=args.sgeargs,
+        )
+    else:
+        logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler)
+        raise SystemError(1)
+
+
 #
 #
 # def update_comparison_results(

From 0d42e79556c15d9af83a2507709f09febd12f8de Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:09:37 +0100
Subject: [PATCH 12/75] Implement `update_comparisons_results()` and commit to
 database

---
 pyani/scripts/subcommands/subcmd_anib.py | 87 +++++++++++++-----------
 1 file changed, 46 insertions(+), 41 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 448555f8..f58b2508 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -424,44 +424,49 @@ def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
 
 #
 #
-# def update_comparison_results(
-#    joblist: List[ComparisonJob], run, session, blastn_version: str, args: Namespace
-# ) -> None:
-#    """Update the Comparision table with the completed result set.
-#
-#    :param joblist:         list of ComparisonJob namedtuples
-#    :param run:             Run ORM object for the current ANIb run
-#    :param session:         active pyanidb session via ORM
-#    :param blastn_version:  version of blastn used for the comparison
-#    :param args:            command-line arguments for this run
-#
-#    The Comparison table stores individual comparison results, one per row.
-#    """
-#    logger = logging.getLogger(__name__)
-#
-#    # Add individual results to Comparison table
-#    for job in tqdm(joblist, disable=args.disable_tqdm):
-#        logger.debug("\t%s vs %s", job.query.description, job.subject.description)
-#        aln_length, sim_errs, ani_pid = anib.process_blast_tab(job.outfile)
-#        qcov = aln_length / job.query.length
-#        scov = aln_length / job.subject.length
-#        run.comparisons.append(
-#            Comparison(
-#                query=job.query,
-#                subject=job.subject,
-#                aln_length=aln_length,
-#                sim_errs=sim_errs,
-#                identity=ani_pid,
-#                cov_query=qcov,
-#                cov_subject=scov,
-#                program="blastn",
-#                version=blastn_version,
-#                fragsize=job.fragsize,
-#                maxmatch=False,
-#            )
-#        )
-#
-#    # Populate db
-#    logger.debug("Committing results to database")
-#    session.commit()
-#
+def update_comparison_results(
+    joblist: List[ComparisonJob],
+    run,
+    session,
+    blastn_version: str,
+    fraglens: Dict,
+    args: Namespace,
+) -> None:
+    """Update the Comparision table with the completed result set.
+    #
+       :param joblist:         list of ComparisonJob namedtuples
+       :param run:             Run ORM object for the current ANIb run
+       :param session:         active pyanidb session via ORM
+       :param blastn_version:  version of blastn used for the comparison
+       :param fraglens:     dictionary of fragment lengths for each genome
+       :param args:            command-line arguments for this run
+    #
+       The Comparison table stores individual comparison results, one per row.
+    """
+    logger = logging.getLogger(__name__)
+    #
+    # Add individual results to Comparison table
+    for job in tqdm(joblist, disable=args.disable_tqdm):
+        logger.debug("\t%s vs %s", job.query.description, job.subject.description)
+        aln_length, sim_errs, ani_pid = anib.parse_blast_tab(job.outfile, fraglens)
+        qcov = aln_length / job.query.length
+        scov = aln_length / job.subject.length
+        run.comparisons.append(
+            Comparison(
+                query=job.query,
+                subject=job.subject,
+                aln_length=aln_length,
+                sim_errs=sim_errs,
+                identity=ani_pid,
+                cov_query=qcov,
+                cov_subject=scov,
+                program="blastn",
+                version=blastn_version,
+                fragsize=job.fragsize,
+                maxmatch=False,
+            )
+        )
+    #
+    # Populate db
+    logger.debug("Committing results to database")
+    session.commit()

From 9e25141c541fc8915fb6ff0cceee39680b527f4e Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:11:13 +0100
Subject: [PATCH 13/75] Update call to `generate_joblist()`

Now passes the lists of all `fragfiles` and `fraglens`
---
 pyani/scripts/subcommands/subcmd_anib.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index f58b2508..39cdab25 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -282,8 +282,10 @@ def subcmd_anib(args: Namespace) -> None:
 
     # Create list of BLASTN jobs for each comparison still to be performed
     logger.info("Creating blastn jobs for ANIb...")
+    # This method considered, but doesn't get fraglens
+    # fragfiles =  [file for file in fragdir.iterdir()]
     joblist = generate_joblist(
-        comparisons_to_run, existingfiles, fragfiles, fraglens, args
+        comparisons_to_run, existingfiles, fragfiles.values(), fraglens.values(), args
     )
     logger.debug("...created %s blastn jobs", len(joblist))
 

From 1e8a47dabf5417f25c5f640190554196fca67e87 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:12:19 +0100
Subject: [PATCH 14/75] Update name of output file in `fragment_fasta_file()`

---
 pyani/scripts/subcommands/subcmd_anib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 39cdab25..70a0d951 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -379,7 +379,7 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path
             idx += fragsize
 
     # Write fragments to output file
-    fragpath = outdir / f"{inpath.stem}-fragments.fasta"
+    fragpath = outdir / f"{inpath.stem}-fragments.fna"
     SeqIO.write(outseqs, fragpath, "fasta")
     return fragpath, json.dumps(sizedict)
 

From b07a1dd59414024481176320caf91b8f22c60361 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:13:10 +0100
Subject: [PATCH 15/75] Update value passed for `maxmatch` to a boolean

Necessary for sqlite to recognise duplicate values
---
 pyani/scripts/subcommands/subcmd_anib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 70a0d951..8e6f8da6 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -226,7 +226,7 @@ def subcmd_anib(args: Namespace) -> None:
     # but remove it from the list of comparisons to be performed
     logger.info("Checking database for existing comparison data...")
     comparisons_to_run = filter_existing_comparisons(
-        session, run, comparisons, "blastn", blastn_version, args.fragsize, None
+        session, run, comparisons, "blastn", blastn_version, args.fragsize, False
     )
     logger.info(
         f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"

From 8f4b131fb9a7be342ad361fbd331b52fcf5638c9 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:13:58 +0100
Subject: [PATCH 16/75] Implement remainder of `subcmd_anib()`

---
 pyani/scripts/subcommands/subcmd_anib.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 8e6f8da6..5d8e64b3 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -288,8 +288,21 @@ def subcmd_anib(args: Namespace) -> None:
         comparisons_to_run, existingfiles, fragfiles.values(), fraglens.values(), args
     )
     logger.debug("...created %s blastn jobs", len(joblist))
+    # print(f"Joblist: {joblist}")
+    # Pass jobs to appropriate scheduler
+    logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler)
+    run_anib_jobs(joblist, args)
+    logger.info("...jobs complete.")
+
+    # Process output and add results to database
+    # This requires us to drop out of threading/multiprocessing: Python's SQLite3
+    # interface doesn't allow sharing connections and cursors
+    logger.info("Adding comparison results to database...")
+    update_comparison_results(joblist, run, session, blastn_version, fraglens, args)
+    update_comparison_matrices(session, run)
+    logger.info("...database updated.")
 
-    raise NotImplementedError
+    # raise NotImplementedError
 
 
 def generate_joblist(

From c76f54c3a7fe88aed7d448f334c98ce57f143b5f Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:14:11 +0100
Subject: [PATCH 17/75] Add a commented question

---
 pyani/scripts/subcommands/subcmd_anib.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 5d8e64b3..abf1cda6 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -249,6 +249,7 @@ def subcmd_anib(args: Namespace) -> None:
     # run, and do not necessarily need to rerun all the jobs. In this case,
     # we prepare a list of output files we want to recover from the results
     # in the output directory.
+    # ¶ Should this use output files, or pull from the database?
     if args.recovery:
         logger.warning("Entering recovery mode...")
         logger.debug(

From c71ce5d58bfa890578c44d3994e545b506c7126d Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:17:14 +0100
Subject: [PATCH 18/75] Alter `generate_blastn_commands()` to only take one
 query/subject pair

---
 pyani/anib.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 89848e49..7bb28376 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -343,13 +343,15 @@ def construct_makeblastdb_cmd(
 
 # Generate list of BLASTN command lines from passed filenames
 def generate_blastn_commands(
-    filenames: List[Path],
+    query: Path,
+    subject: Path,
     outdir: Path,
     blast_exe: Optional[Path] = None,
 ) -> List[str]:
     """Return a list of blastn command-lines for ANIb.
 
-    :param filenames:  a list of paths to fragmented input FASTA files
+    :param query:  a paths to the query's fragmented input FASTA file
+    :param subject:  a paths to the subject's fragmented input FASTA file
     :param outdir:  path to output directory
     :param blastn_exe:  path to BLASTN executable
     :param method:  str, analysis type (ANIb)
@@ -359,22 +361,12 @@ def generate_blastn_commands(
     have the form ACCESSION.ext. This is the convention followed by the
     fragment_FASTA_files() function above.
     """
-    cmdlines = []
-    for idx, fname1 in enumerate(filenames[:-1]):
-        dbname1 = Path(str(fname1).replace("-fragments", ""))
-        for fname2 in filenames[idx + 1 :]:
-            dbname2 = Path(str(fname2).replace("-fragments", ""))
-            if blast_exe is None:
-                cmdlines.append(construct_blastn_cmdline(fname1, dbname2, outdir))
-                cmdlines.append(construct_blastn_cmdline(fname2, dbname1, outdir))
-            else:
-                cmdlines.append(
-                    construct_blastn_cmdline(fname1, dbname2, outdir, blast_exe)
-                )
-                cmdlines.append(
-                    construct_blastn_cmdline(fname2, dbname1, outdir, blast_exe)
-                )
-    return cmdlines
+    subj_db = outdir / "blastdbs" / Path(str(subject.name))
+    if blast_exe is None:
+        cmdline = construct_blastn_cmdline(query, subj_db, outdir)
+    else:
+        cmdline = construct_blastn_cmdline(query, subj_db, outdir, blast_exe)
+    return cmdline
 
 
 # Generate single BLASTN command line

From 585b98f119014fff57973dea5de24d0b237ec95d Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:18:20 +0100
Subject: [PATCH 19/75] Alter `construct_blastn_cmdline()` for a single
 query/subject pair

---
 pyani/anib.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 7bb28376..11a75b62 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -371,21 +371,21 @@ def generate_blastn_commands(
 
 # Generate single BLASTN command line
 def construct_blastn_cmdline(
-    fname1: Path,
-    fname2: Path,
+    query: Path,
+    subj_db: Path,
     outdir: Path,
     blastn_exe: Path = pyani_config.BLASTN_DEFAULT,
 ) -> str:
     """Return a single blastn command.
 
-    :param fname1:
-    :param fname2:
-    :param outdir:
+    :param fname1:  Path, FASTA file for query genome
+    :param fname2:  Path, database of fragments for subject genome
+    :param outdir:  Path, to the output directory
     :param blastn_exe:  str, path to blastn executable
     """
-    prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}"
+    prefix = outdir / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}"
     return (
-        f"{blastn_exe} -out {prefix}.blast_tab -query {fname1} -db {fname2} "
+        f"{blastn_exe} -out {prefix}.blast_tab -query {query} -db {subj_db} "
         "-xdrop_gap_final 150 -dust no -evalue 1e-15 -max_target_seqs 1 -outfmt "
         "'6 qseqid sseqid length mismatch pident nident qlen slen "
         "qstart qend sstart send positive ppos gaps' "

From 059ffdf6b3120de7cb3ac7f26fd5b6cbe91c4fba Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:18:49 +0100
Subject: [PATCH 20/75] Remove `method` parameter from `process_blast()` call

---
 pyani/anib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 11a75b62..b9c111b6 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -453,7 +453,7 @@ def process_blast(
                     blastfile,
                 )
             continue
-        resultvals = parse_blast_tab(blastfile, fraglengths, method)
+        resultvals = parse_blast_tab(blastfile, fraglengths)
         query_cover = float(resultvals[0]) / org_lengths[qname]
 
         # Populate dataframes: when assigning data, we need to note that

From aed81e1ebf9440a5df62f2b88235268082f57e25 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:20:00 +0100
Subject: [PATCH 21/75] Change `outfilename` in `construct_makeblastdb_cmd() `

---
 pyani/anib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index b9c111b6..3e8d9476 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -334,7 +334,7 @@ def construct_makeblastdb_cmd(
     :param outdir:  Path, directory for output
     :param blastdb_exe:  Path, path to the makeblastdb executable
     """
-    outfilename = outdir / filename.name
+    outfilename = outdir / filename.name.replace("-fragments", "")
     return (
         f"{blastdb_exe} -dbtype nucl -in {filename} -title {filename.stem} -out {outfilename}",
         outfilename,

From 0f0653ffd2b22ce5c2087d7a7bfec6419a508a0a Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:21:05 +0100
Subject: [PATCH 22/75] Move `aniblastall`-specific tests to
 `test_aniblastall.py`

---
 tests/test_anib.py        | 111 +---------------------
 tests/test_aniblastall.py | 189 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+), 110 deletions(-)
 create mode 100644 tests/test_aniblastall.py

diff --git a/tests/test_anib.py b/tests/test_anib.py
index 4ba75099..b15423bd 100644
--- a/tests/test_anib.py
+++ b/tests/test_anib.py
@@ -119,65 +119,6 @@ def anib_output_dir(dir_anib_in):
     )
 
 
-# Test legacy BLAST (blastall) command generation
-def test_blastall_dbjobdict(path_fna_all, tmp_path):
-    """Generate dictionary of legacy BLASTN database jobs."""
-    blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path)
-    jobdict = anib.build_db_jobs(path_fna_all, blastcmds)
-    expected = [
-        (tmp_path / _.name, f"formatdb -p F -i {tmp_path / _.name} -t {_.stem}")
-        for _ in path_fna_all
-    ]
-    assert sorted([(k, v.script) for (k, v) in jobdict.items()]) == sorted(expected)
-
-
-def test_blastall_graph(path_fna_all, tmp_path, fragment_length):
-    """Create jobgraph for legacy BLASTN jobs."""
-    fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length)
-    blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path)
-    jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds)
-    # We check that the main script job is a blastn job, and that there
-    # is a single dependency, which is a makeblastdb job
-    for job in jobgraph:
-        assert job.script.startswith("blastall -p blastn")
-        assert len(job.dependencies) == 1
-        assert job.dependencies[0].script.startswith("formatdb")
-
-
-def test_blastall_multiple(path_fna_two, tmp_path):
-    """Generate legacy BLASTN commands."""
-    cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIblastall")
-    expected = [
-        (
-            "blastall -p blastn -o "
-            f"{tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} "
-            f"-i {path_fna_two[0]} "
-            f"-d {path_fna_two[1]} "
-            "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
-        ),
-        (
-            "blastall -p blastn -o "
-            f"{tmp_path / str(path_fna_two[1].stem + '_vs_' + path_fna_two[0].stem + '.blast_tab')} "
-            f"-i {path_fna_two[1]} "
-            f"-d {path_fna_two[0]} "
-            "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
-        ),
-    ]
-    assert cmds == expected
-
-
-def test_blastall_single(path_fna_two, tmp_path):
-    """Generate legacy BLASTN command-line."""
-    cmd = anib.construct_blastall_cmdline(path_fna_two[0], path_fna_two[1], tmp_path)
-    expected = (
-        f"blastall -p blastn -o {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} "
-        f"-i {path_fna_two[0]} "
-        f"-d {path_fna_two[1]} "
-        "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
-    )
-    assert cmd == expected
-
-
 # Test BLAST+ (blastn) command generation
 def test_blastn_dbjobdict(path_fna_all, tmp_path):
     """Generate dictionary of BLASTN+ database jobs."""
@@ -248,30 +189,6 @@ def test_blastn_single(path_fna_two, tmp_path):
     assert cmd == expected
 
 
-# Test legacy BLAST database formatting (formatdb) command generation
-def test_formatdb_multiple(path_fna_two, tmp_path):
-    """Generate legacy BLAST db creation commands."""
-    cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIblastall")
-    expected = [
-        (
-            f"formatdb -p F -i {tmp_path / path_fna_two[0].name} -t {path_fna_two[0].stem}",
-            tmp_path / path_fna_two[0].name,
-        ),
-        (
-            f"formatdb -p F -i {tmp_path / path_fna_two[1].name} -t {path_fna_two[1].stem}",
-            tmp_path / path_fna_two[1].name,
-        ),
-    ]
-    assert cmds == expected
-
-
-def test_formatdb_single(path_fna, tmp_path):
-    """Generate legacy BLAST formatdb command-line."""
-    cmd = anib.construct_formatdb_cmd(path_fna, tmp_path)
-    expected = f"formatdb -p F -i {tmp_path / path_fna.name} -t {path_fna.stem}"
-    assert cmd[0] == expected
-
-
 # Test FASTA file fragmentation for ANIb methods
 def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_length):
     """Fragment files for ANIb/ANIblastall."""
@@ -317,26 +234,11 @@ def test_makeblastdb_single(path_fna, tmp_path):
 
 
 # Test output file parsing for ANIb methods
-def test_parse_legacy_blastdir(anib_output_dir):
-    """Parses directory of legacy BLAST output."""
-    orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
-    fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles)
-    result = anib.process_blast(
-        anib_output_dir.legacyblastdir, orglengths, fraglengths, mode="ANIblastall"
-    )
-    assert_frame_equal(
-        result.percentage_identity.sort_index(1).sort_index(),
-        anib_output_dir.legacyblastresult.sort_index(1).sort_index(),
-    )
-
-
 def test_parse_blastdir(anib_output_dir):
     """Parse directory of BLAST+ output."""
     orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
     fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles)
-    result = anib.process_blast(
-        anib_output_dir.blastdir, orglengths, fraglengths, mode="ANIb"
-    )
+    result = anib.process_blast(anib_output_dir.blastdir, orglengths, fraglengths)
     assert_frame_equal(
         result.percentage_identity.sort_index(1).sort_index(),
         anib_output_dir.blastresult.sort_index(1).sort_index(),
@@ -348,14 +250,3 @@ def test_parse_blasttab(anib_output):
     fragdata = anib.get_fraglength_dict([anib_output.fragfile])
     result = anib.parse_blast_tab(anib_output.tabfile, fragdata, mode="ANIb")
     assert (a == b for a, b in zip(result, [4_016_551, 93, 99.997_693_577_050_029]))
-
-
-def test_parse_legacy_blasttab(anib_output):
-    """Parses ANIB legacy .blast_tab output."""
-    fragdata = anib.get_fraglength_dict([anib_output.fragfile])
-    result = anib.parse_blast_tab(
-        anib_output.legacytabfile, fragdata, mode="ANIblastall"
-    )
-    assert (
-        a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018])
-    )
diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
new file mode 100644
index 00000000..339c9263
--- /dev/null
+++ b/tests/test_aniblastall.py
@@ -0,0 +1,189 @@
+"""Test anib.py module.
+
+These tests are intended to be run from the repository root using:
+
+pytest -v
+"""
+
+from pathlib import Path
+from typing import List, NamedTuple
+
+import pandas as pd
+import pytest  # noqa: F401  # pylint: disable=unused-import
+
+from pandas.util.testing import assert_frame_equal
+
+from pyani import anib, pyani_files
+
+
+class ANIbOutput(NamedTuple):
+
+    """Convenience struct for ANIb output."""
+
+    fragfile: Path
+    tabfile: Path
+    legacytabfile: Path
+
+
+class ANIbOutputDir(NamedTuple):
+
+    """Convenience struct for ANIb output."""
+
+    infiles: List[Path]
+    fragfiles: List[Path]
+    blastdir: Path
+    legacyblastdir: Path
+    blastresult: pd.DataFrame
+    legacyblastresult: pd.DataFrame
+
+
+@pytest.fixture
+def anib_output(dir_anib_in):
+    """Namedtuple of example ANIb output.
+
+    fragfile - fragmented FASTA query file
+    tabfile  - BLAST+ tabular output
+    legacytabfile - blastall tabular output
+    """
+    return ANIbOutput(
+        dir_anib_in / "NC_002696-fragments.fna",
+        dir_anib_in / "NC_002696_vs_NC_011916.blast_tab",
+        dir_anib_in / "NC_002696_vs_NC_010338.blast_tab",
+    )
+
+
+@pytest.fixture
+def anib_output_dir(dir_anib_in):
+    """Namedtuple of example ANIb output - full directory.
+
+    infiles - list of FASTA query files
+    fragfiles - list of fragmented FASTA query files
+    blastdir - path to BLAST+ output data
+    legacyblastdir - path to blastall output data
+    blastresult - pd.DataFrame result for BLAST+
+    legacyblastresult - pd.DataFrame result for blastall
+    """
+    return ANIbOutputDir(
+        [
+            _
+            for _ in (dir_anib_in / "sequences").iterdir()
+            if _.is_file() and _.suffix == ".fna"
+        ],
+        [
+            _
+            for _ in (dir_anib_in / "fragfiles").iterdir()
+            if _.is_file() and _.suffix == ".fna"
+        ],
+        dir_anib_in / "blastn",
+        dir_anib_in / "blastall",
+        pd.read_csv(dir_anib_in / "dataframes" / "blastn_result.csv", index_col=0),
+        pd.read_csv(dir_anib_in / "dataframes" / "blastall_result.csv", index_col=0),
+    )
+
+
+# Test legacy BLAST (blastall) command generation
+def test_blastall_dbjobdict(path_fna_all, tmp_path):
+    """Generate dictionary of legacy BLASTN database jobs."""
+    blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path)
+    jobdict = anib.build_db_jobs(path_fna_all, blastcmds)
+    expected = [
+        (tmp_path / _.name, f"formatdb -p F -i {tmp_path / _.name} -t {_.stem}")
+        for _ in path_fna_all
+    ]
+    assert sorted([(k, v.script) for (k, v) in jobdict.items()]) == sorted(expected)
+
+
+def test_blastall_graph(path_fna_all, tmp_path, fragment_length):
+    """Create jobgraph for legacy BLASTN jobs."""
+    fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length)
+    blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path)
+    jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds)
+    # We check that the main script job is a blastn job, and that there
+    # is a single dependency, which is a makeblastdb job
+    for job in jobgraph:
+        assert job.script.startswith("blastall -p blastn")
+        assert len(job.dependencies) == 1
+        assert job.dependencies[0].script.startswith("formatdb")
+
+
+def test_blastall_multiple(path_fna_two, tmp_path):
+    """Generate legacy BLASTN commands."""
+    cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIblastall")
+    expected = [
+        (
+            "blastall -p blastn -o "
+            f"{tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} "
+            f"-i {path_fna_two[0]} "
+            f"-d {path_fna_two[1]} "
+            "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
+        ),
+        (
+            "blastall -p blastn -o "
+            f"{tmp_path / str(path_fna_two[1].stem + '_vs_' + path_fna_two[0].stem + '.blast_tab')} "
+            f"-i {path_fna_two[1]} "
+            f"-d {path_fna_two[0]} "
+            "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
+        ),
+    ]
+    assert cmds == expected
+
+
+def test_blastall_single(path_fna_two, tmp_path):
+    """Generate legacy BLASTN command-line."""
+    cmd = anib.construct_blastall_cmdline(path_fna_two[0], path_fna_two[1], tmp_path)
+    expected = (
+        f"blastall -p blastn -o {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} "
+        f"-i {path_fna_two[0]} "
+        f"-d {path_fna_two[1]} "
+        "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
+    )
+    assert cmd == expected
+
+
+# Test legacy BLAST database formatting (formatdb) command generation
+def test_formatdb_multiple(path_fna_two, tmp_path):
+    """Generate legacy BLAST db creation commands."""
+    cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIblastall")
+    expected = [
+        (
+            f"formatdb -p F -i {tmp_path / path_fna_two[0].name} -t {path_fna_two[0].stem}",
+            tmp_path / path_fna_two[0].name,
+        ),
+        (
+            f"formatdb -p F -i {tmp_path / path_fna_two[1].name} -t {path_fna_two[1].stem}",
+            tmp_path / path_fna_two[1].name,
+        ),
+    ]
+    assert cmds == expected
+
+
+def test_formatdb_single(path_fna, tmp_path):
+    """Generate legacy BLAST formatdb command-line."""
+    cmd = anib.construct_formatdb_cmd(path_fna, tmp_path)
+    expected = f"formatdb -p F -i {tmp_path / path_fna.name} -t {path_fna.stem}"
+    assert cmd[0] == expected
+
+
+# Test output file parsing for ANIb methods
+def test_parse_legacy_blastdir(anib_output_dir):
+    """Parses directory of legacy BLAST output."""
+    orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
+    fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles)
+    result = anib.process_blast(
+        anib_output_dir.legacyblastdir, orglengths, fraglengths, mode="ANIblastall"
+    )
+    assert_frame_equal(
+        result.percentage_identity.sort_index(1).sort_index(),
+        anib_output_dir.legacyblastresult.sort_index(1).sort_index(),
+    )
+
+
+def test_parse_legacy_blasttab(anib_output):
+    """Parses ANIB legacy .blast_tab output."""
+    fragdata = anib.get_fraglength_dict([anib_output.fragfile])
+    result = anib.parse_blast_tab(
+        anib_output.legacytabfile, fragdata, mode="ANIblastall"
+    )
+    assert (
+        a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018])
+    )

From fc273df93d786038b6780fcb587ceb5d01b9ff2c Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:21:55 +0100
Subject: [PATCH 23/75] Change call to `parse_blast_tab()` to not use `method`
 parameter

---
 tests/test_anib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_anib.py b/tests/test_anib.py
index b15423bd..4267697b 100644
--- a/tests/test_anib.py
+++ b/tests/test_anib.py
@@ -248,5 +248,5 @@ def test_parse_blastdir(anib_output_dir):
 def test_parse_blasttab(anib_output):
     """Parse ANIb BLAST+ .blast_tab output."""
     fragdata = anib.get_fraglength_dict([anib_output.fragfile])
-    result = anib.parse_blast_tab(anib_output.tabfile, fragdata, mode="ANIb")
+    result = anib.parse_blast_tab(anib_output.tabfile, fragdata)
     assert (a == b for a, b in zip(result, [4_016_551, 93, 99.997_693_577_050_029]))

From cc4d3465798e9df5612d1c2dc133893d5a72c1b3 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:24:23 +0100
Subject: [PATCH 24/75] Skip tests as a result of changes to how command lines
 are generated

Currently, we don't pass a list of genome pairs to `anib.py`.
---
 tests/test_anib.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_anib.py b/tests/test_anib.py
index 4267697b..cc6d73d4 100644
--- a/tests/test_anib.py
+++ b/tests/test_anib.py
@@ -147,6 +147,7 @@ def test_blastn_graph(path_fna_all, tmp_path, fragment_length):
         assert job.dependencies[0].script.startswith("makeblastdb")
 
 
+@pytest.mark.skip(reason="unsure this is needed")
 def test_blastn_multiple(path_fna_two, tmp_path):
     """Generate BLASTN+ commands."""
     # BLAST+
@@ -201,6 +202,7 @@ def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_leng
 
 
 # Test BLAST+ database formatting (makeblastdb) command generation
+@pytest.mark.skip(reason="unsure this is needed")
 def test_makeblastdb_multiple(path_fna_two, tmp_path):
     """Generate multiple BLAST+ makeblastdb command-lines."""
     cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIb")

From 0e3d471c4c52a07ab3bf6d29f95857d8c2588ba0 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 00:35:36 +0100
Subject: [PATCH 25/75] Add `get_version()` tests and boilerplate

---
 tests/test_aniblastall.py | 81 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
index 339c9263..2807ea35 100644
--- a/tests/test_aniblastall.py
+++ b/tests/test_aniblastall.py
@@ -1,7 +1,42 @@
-"""Test anib.py module.
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# (c) University of Strathclyde 2019-2021
+# Author: Bailey Harrington
+#
+# Contact:
+# leighton.pritchard@strath.ac.uk
+#
+# Leighton Pritchard,
+# Strathclyde Institute for Pharmacy and Biomedical Sciences,
+# 161 Cathedral Street,
+# Glasgow,
+# G4 0RE
+# Scotland,
+# UK
+#
+# The MIT License
+#
+# Copyright (c) 2019-2021 University of Strathclyde
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""Test aniblastall.py module.
 These tests are intended to be run from the repository root using:
-
 pytest -v
 """
 
@@ -14,6 +49,7 @@
 from pandas.util.testing import assert_frame_equal
 
 from pyani import anib, pyani_files
+from pyani import aniblastall
 
 
 class ANIbOutput(NamedTuple):
@@ -81,6 +117,45 @@ def anib_output_dir(dir_anib_in):
     )
 
 
+# Test get_version()
+# Test case 1: there is no executable
+def test_get_version_missing_exe(executable_missing):
+    """Test behaviour when there is no file at the specified executable location."""
+    test_file_1 = Path("/non/existent/blastall")
+    assert aniblastall.get_version(test_file_1) == f"No blastall at {test_file_1}"
+
+
+# Test case 2: there is a file, but it is not executable
+def test_get_version_not_executable(executable_not_executable):
+    """Test behaviour when the file at the executable location is not executable."""
+    test_file_2 = Path("/non/executable/blastall")
+    assert (
+        aniblastall.get_version(test_file_2)
+        == f"blastall exists at {test_file_2} but not executable"
+    )
+
+
+# Test case 3: there is an executable file, but the version can't be retrieved
+def test_get_version_no_version(executable_without_version):
+    """Test behaviour when the version for the executable can not be retrieved."""
+    test_file_3 = Path("/missing/version/blastall")
+    assert (
+        aniblastall.get_version(test_file_3)
+        == f"blastall exists at {test_file_3} but could not retrieve version"
+    )
+
+
+# Test case 4: there is an executable file, but it will not run on the OS
+def test_get_version_os_incompatible(executable_incompatible_with_os):
+    """Test behaviour when the program can't run on the operating system.
+    This will happen with newer versions of MacOS."""
+    test_file_4 = Path("/os/incompatible/blastall")
+    assert (
+        aniblastall.get_version(test_file_4)
+        == f"blastall exists at {test_file_4} but could not be executed"
+    )
+
+
 # Test legacy BLAST (blastall) command generation
 def test_blastall_dbjobdict(path_fna_all, tmp_path):
     """Generate dictionary of legacy BLASTN database jobs."""

From 8f94ba05c4a16d2d16f841d62efa0d70bbfabf2b Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 14 Sep 2021 21:52:27 +0100
Subject: [PATCH 26/75] Make changes to reflect splitting of `anib` and
 `aniblastall`

- remove `mode` or `method` arguments in function calls/definitions
- change references to module names from `anib` to `aniblastall` where
necessary
---
 pyani/anib.py                                |   6 +-
 pyani/aniblastall.py                         | 125 +++++++++++++++++--
 pyani/scripts/average_nucleotide_identity.py |  15 ++-
 tests/test_anib.py                           |   8 +-
 tests/test_aniblastall.py                    |  34 ++---
 tests/test_concordance.py                    |  16 +--
 tests/test_multiprocessing.py                |   2 +-
 7 files changed, 157 insertions(+), 49 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 86f23996..4b5ec3e3 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -246,7 +246,7 @@ def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict:
 
 
 def make_blastcmd_builder(
-    method: str,
+    # method: str,
     outdir: Path,
     format_exe: Optional[Path] = None,
     blast_exe: Optional[Path] = None,
@@ -405,7 +405,9 @@ def construct_blastn_cmdline(
     :param outdir:  Path, to the output directory
     :param blastn_exe:  str, path to blastn executable
     """
-    prefix = outdir / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}"
+    prefix = Path(outdir) / Path(
+        f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}"
+    )
     return (
         f"{blastn_exe} -out {prefix}.blast_tab -query {query} -db {subj_db} "
         "-xdrop_gap_final 150 -dust no -evalue 1e-15 -max_target_seqs 1 -outfmt "
diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index b77a59ea..24c9f901 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -41,6 +41,7 @@
 import re
 import shutil
 import subprocess
+from logging import Logger
 
 import pandas as pd
 
@@ -51,10 +52,11 @@
 
 from . import pyani_config
 from . import pyani_jobs
-from .pyani_tools import BLASTcmds, BLASTfunctions, BLASTexes
+from .pyani_tools import BLASTcmds, BLASTfunctions, BLASTexes, ANIResults
+from pyani import pyani_files
 
 
-def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str:
+def get_version(blastall_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str:
     r"""Return BLAST blastall version as a string.
 
     :param blast_exe:  path to blastall executable
@@ -76,18 +78,46 @@ def get_version(blast_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str:
     - no version info returned
     - executable cannot be run on this OS
     """
-    cmdline = [blast_exe, "-version"]
-    result = subprocess.run(
-        cmdline,  # type: ignore
-        shell=False,
-        stdout=subprocess.PIPE,  # type: ignore
-        stderr=subprocess.PIPE,
-        check=False,  # blastall doesn't return 0
-    )
+    logger = logging.getLogger(__name__)
+
+    try:
+        blastall_path = Path(shutil.which(blastall_exe))  # type:ignore
+    except TypeError:
+        return f"{blastall_exe} is not found in $PATH"
+
+    if not blastall_path.is_file():  # no executable
+        return f"No blastall at {blastall_path}"
+
+    # This should catch cases when the file can't be executed by the user
+    if not os.access(blastall_path, os.X_OK):  # file exists but not executable
+        return f"blastall exists at {blastall_path} but not executable"
+
+    if platform.system() == "Darwin":
+        cmdline = [blastall_exe, "-version"]
+    else:
+        cmdline = [blastall_exe]
+
+    try:
+        result = subprocess.run(
+            cmdline,  # type: ignore
+            shell=False,
+            stdout=subprocess.PIPE,  # type: ignore
+            stderr=subprocess.PIPE,
+            check=False,  # blastall doesn't return 0
+        )
+
+    except OSError:
+        logger.warning("blastall executable will not run", exc_info=True)
+        return f"blastall exists at {blastall_path} but could not be executed"
+
     version = re.search(  # type: ignore
         r"(?<=blastall\s)[0-9\.]*", str(result.stderr, "utf-8")
     ).group()
-    return f"{platform.system()}_{version}"
+
+    if 0 == len(version.strip()):
+        return f"blastall exists at {blastall_path} but could not retrieve version"
+
+    return f"{platform.system()}_{version} ({blastall_path})"
 
 
 # Divide input FASTA sequences into fragments
@@ -185,7 +215,7 @@ def build_db_jobs(infiles: List[Path], blastcmds: BLASTcmds) -> Dict:
 
 
 def make_blastcmd_builder(
-    method: str,
+    # method: str,
     outdir: Path,
     format_exe: Optional[Path] = None,
     blast_exe: Optional[Path] = None,
@@ -268,6 +298,7 @@ def generate_blastdb_commands(
     filenames: List[Path],
     outdir: Path,
     blastdb_exe: Optional[Path] = None,
+    # mode: "ANIblastall"
 ) -> List[Tuple[str, Path]]:
     """Return list of makeblastdb command-lines for ANIblastall.
 
@@ -295,7 +326,7 @@ def construct_formatdb_cmd(
     :param outdir:  Path, path to output directory
     :param blastdb_exe:  Path, path to the formatdb executable
     """
-    newfilename = outdir / filename.name
+    newfilename = Path(outdir) / Path(filename.name)
     shutil.copy(filename, newfilename)
     return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename)
 
@@ -436,3 +467,71 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     sim_errors = filtered["blast_mismatch"].sum() + filtered["blast_gaps"].sum()
     filtered.to_csv(Path(filename).with_suffix(".blast_tab.dataframe"), sep="\t")
     return aln_length, sim_errors, ani_pid
+
+
+def process_blast(
+    blast_dir: Path,
+    org_lengths: Dict,
+    fraglengths: Dict,
+    mode: str = "ANIblastall",
+    logger: Optional[Logger] = None,
+) -> ANIResults:
+    """Return tuple of ANIb results for .blast_tab files in the output dir.
+    :param blast_dir:  Path, path to the directory containing .blast_tab files
+    :param org_lengths:  Dict, the base count for each input sequence
+    :param fraglengths:  dictionary of query sequence fragment lengths, only
+        needed for BLASTALL output
+    :param mode:  str, analysis type (ANIb or ANIblastall)
+    :param logger:  a logger for messages
+    Returns the following pandas dataframes in an ANIResults object;
+    query sequences are rows, subject sequences are columns:
+    - alignment_lengths - non-symmetrical: total length of alignment
+    - percentage_identity - non-symmetrical: ANIb (Goris) percentage identity
+    - alignment_coverage - non-symmetrical: coverage of query
+    - similarity_errors - non-symmetrical: count of similarity errors
+    May throw a ZeroDivisionError if one or more BLAST runs failed, or a
+    very distant sequence was included in the analysis.
+    """
+    # Process directory to identify input files
+    blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab")
+    # Hold data in ANIResults object
+    results = ANIResults(list(org_lengths.keys()), mode)
+
+    # Fill diagonal NA values for alignment_length with org_lengths
+    for org, length in list(org_lengths.items()):
+        results.alignment_lengths[org][org] = length
+
+    # Process .blast_tab files assuming that the filename format holds:
+    # org1_vs_org2.blast_tab:
+    for blastfile in blastfiles:
+        qname, sname = blastfile.stem.split("_vs_")
+
+        # We may have BLAST files from other analyses in the same directory
+        # If this occurs, we raise a warning, and skip the file
+        if qname not in list(org_lengths.keys()):
+            if logger:
+                logger.warning(
+                    "Query name %s not in input sequence list, skipping %s",
+                    qname,
+                    blastfile,
+                )
+            continue
+        if sname not in list(org_lengths.keys()):
+            if logger:
+                logger.warning(
+                    "Subject name %s not in input sequence list, skipping %s",
+                    sname,
+                    blastfile,
+                )
+            continue
+        resultvals = parse_blast_tab(blastfile, fraglengths)
+        query_cover = float(resultvals[0]) / org_lengths[qname]
+
+        # Populate dataframes: when assigning data, we need to note that
+        # we have asymmetrical data from BLAST output, so only the
+        # upper triangle is populated
+        results.add_tot_length(qname, sname, resultvals[0], sym=False)
+        results.add_sim_errors(qname, sname, resultvals[1], sym=False)
+        results.add_pid(qname, sname, 0.01 * resultvals[2], sym=False)
+        results.add_coverage(qname, sname, query_cover)
+    return results
diff --git a/pyani/scripts/average_nucleotide_identity.py b/pyani/scripts/average_nucleotide_identity.py
index 10fc7767..44f6f26f 100755
--- a/pyani/scripts/average_nucleotide_identity.py
+++ b/pyani/scripts/average_nucleotide_identity.py
@@ -141,7 +141,7 @@
 import pandas as pd
 
 from pyani import (
-    anib,
+    # anib,
     anim,
     tetra,
     pyani_config,
@@ -627,6 +627,7 @@ def make_sequence_fragments(
     for ANIb methods), and writes BLAST databases of these fragments,
     and fragment lengths of sequences, to local files.
     """
+    global anib
     fragfiles, fraglengths = anib.fragment_fasta_files(infiles, blastdir, args.fragsize)
     # Export fragment lengths as JSON, in case we re-run with --skip_blastn
     fragpath = blastdir / "fraglengths.json"
@@ -659,7 +660,7 @@ def run_blast(
         # Run BLAST database-building and executables from a jobgraph
         logger.info("Creating job dependency graph")
         jobgraph = anib.make_job_graph(
-            infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir)
+            infiles, fragfiles, anib.make_blastcmd_builder(blastdir)
         )
         if args.scheduler == "multiprocessing":
             logger.info("Running dependency graph with multiprocessing")
@@ -740,9 +741,7 @@ def unified_anib(
     # Process pairwise BLASTN output
     logger.info("Processing pairwise %s BLAST output.", args.method)
     try:
-        data = anib.process_blast(
-            blastdir, org_lengths, fraglengths=fraglengths, mode=args.method
-        )
+        data = anib.process_blast(blastdir, org_lengths, fraglengths=fraglengths)
     except ZeroDivisionError:
         logger.error("One or more BLAST output files has a problem.")
         if not args.skip_blastn:
@@ -958,6 +957,12 @@ def run_main(argsin: Optional[Namespace] = None) -> int:
     logger = logging.getLogger(__name__)
     config_logger(args)
 
+    # Conditionally load correct blast module
+    if args.method == "ANIb":
+        globals()["anib"] = __import__("pyani.anib", fromlist="pyani")
+    elif args.method == "ANIblastall":
+        globals()["anib"] = __import__("pyani.aniblastall", fromlist="pyani")
+
     # Ensure argument validity and get method function/config
     test_class_label_paths(args, logger)
     test_scheduler(args, logger)
diff --git a/tests/test_anib.py b/tests/test_anib.py
index 874c3bf4..1710e84f 100644
--- a/tests/test_anib.py
+++ b/tests/test_anib.py
@@ -124,7 +124,7 @@ def anib_output_dir(dir_anib_in):
 # Test BLAST+ (blastn) command generation
 def test_blastn_dbjobdict(path_fna_all, tmp_path):
     """Generate dictionary of BLASTN+ database jobs."""
-    blastcmds = anib.make_blastcmd_builder("ANIb", tmp_path)
+    blastcmds = anib.make_blastcmd_builder(tmp_path)
     jobdict = anib.build_db_jobs(path_fna_all, blastcmds)
     expected = [
         (
@@ -139,7 +139,7 @@ def test_blastn_dbjobdict(path_fna_all, tmp_path):
 def test_blastn_graph(path_fna_all, tmp_path, fragment_length):
     """Create jobgraph for BLASTN+ jobs."""
     fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length)
-    blastcmds = anib.make_blastcmd_builder("ANIb", tmp_path)
+    blastcmds = anib.make_blastcmd_builder(tmp_path)
     jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds)
     # We check that the main script job is a blastn job, and that there
     # is a single dependency, which is a makeblastdb job
@@ -153,7 +153,7 @@ def test_blastn_graph(path_fna_all, tmp_path, fragment_length):
 def test_blastn_multiple(path_fna_two, tmp_path):
     """Generate BLASTN+ commands."""
     # BLAST+
-    cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIb")
+    cmds = anib.generate_blastn_commands(path_fna_two, tmp_path)
     expected = [
         (
             f"blastn -out {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} "
@@ -207,7 +207,7 @@ def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_leng
 @pytest.mark.skip(reason="unsure this is needed")
 def test_makeblastdb_multiple(path_fna_two, tmp_path):
     """Generate multiple BLAST+ makeblastdb command-lines."""
-    cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIb")
+    cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path)
     expected = [
         (
             (
diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
index 70ae8108..5c5c6da6 100644
--- a/tests/test_aniblastall.py
+++ b/tests/test_aniblastall.py
@@ -161,8 +161,8 @@ def test_get_version_os_incompatible(executable_incompatible_with_os):
 # Test legacy BLAST (blastall) command generation
 def test_blastall_dbjobdict(path_fna_all, tmp_path):
     """Generate dictionary of legacy BLASTN database jobs."""
-    blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path)
-    jobdict = anib.build_db_jobs(path_fna_all, blastcmds)
+    blastcmds = aniblastall.make_blastcmd_builder(tmp_path)
+    jobdict = aniblastall.build_db_jobs(path_fna_all, blastcmds)
     expected = [
         (tmp_path / _.name, f"formatdb -p F -i {tmp_path / _.name} -t {_.stem}")
         for _ in path_fna_all
@@ -172,9 +172,11 @@ def test_blastall_dbjobdict(path_fna_all, tmp_path):
 
 def test_blastall_graph(path_fna_all, tmp_path, fragment_length):
     """Create jobgraph for legacy BLASTN jobs."""
-    fragresult = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length)
-    blastcmds = anib.make_blastcmd_builder("ANIblastall", tmp_path)
-    jobgraph = anib.make_job_graph(path_fna_all, fragresult[0], blastcmds)
+    fragresult = aniblastall.fragment_fasta_files(
+        path_fna_all, tmp_path, fragment_length
+    )
+    blastcmds = aniblastall.make_blastcmd_builder(tmp_path)
+    jobgraph = aniblastall.make_job_graph(path_fna_all, fragresult[0], blastcmds)
     # We check that the main script job is a blastn job, and that there
     # is a single dependency, which is a makeblastdb job
     for job in jobgraph:
@@ -185,7 +187,7 @@ def test_blastall_graph(path_fna_all, tmp_path, fragment_length):
 
 def test_blastall_multiple(path_fna_two, tmp_path):
     """Generate legacy BLASTN commands."""
-    cmds = anib.generate_blastn_commands(path_fna_two, tmp_path, mode="ANIblastall")
+    cmds = aniblastall.generate_blastn_commands(path_fna_two, tmp_path)
     expected = [
         (
             "blastall -p blastn -o "
@@ -207,7 +209,9 @@ def test_blastall_multiple(path_fna_two, tmp_path):
 
 def test_blastall_single(path_fna_two, tmp_path):
     """Generate legacy BLASTN command-line."""
-    cmd = anib.construct_blastall_cmdline(path_fna_two[0], path_fna_two[1], tmp_path)
+    cmd = aniblastall.construct_blastall_cmdline(
+        path_fna_two[0], path_fna_two[1], tmp_path
+    )
     expected = (
         f"blastall -p blastn -o {tmp_path / str(path_fna_two[0].stem + '_vs_' + path_fna_two[1].stem + '.blast_tab')} "
         f"-i {path_fna_two[0]} "
@@ -220,7 +224,7 @@ def test_blastall_single(path_fna_two, tmp_path):
 # Test legacy BLAST database formatting (formatdb) command generation
 def test_formatdb_multiple(path_fna_two, tmp_path):
     """Generate legacy BLAST db creation commands."""
-    cmds = anib.generate_blastdb_commands(path_fna_two, tmp_path, mode="ANIblastall")
+    cmds = aniblastall.generate_blastdb_commands(path_fna_two, tmp_path)
     expected = [
         (
             f"formatdb -p F -i {tmp_path / path_fna_two[0].name} -t {path_fna_two[0].stem}",
@@ -236,7 +240,7 @@ def test_formatdb_multiple(path_fna_two, tmp_path):
 
 def test_formatdb_single(path_fna, tmp_path):
     """Generate legacy BLAST formatdb command-line."""
-    cmd = anib.construct_formatdb_cmd(path_fna, tmp_path)
+    cmd = aniblastall.construct_formatdb_cmd(path_fna, tmp_path)
     expected = f"formatdb -p F -i {tmp_path / path_fna.name} -t {path_fna.stem}"
     assert cmd[0] == expected
 
@@ -245,9 +249,9 @@ def test_formatdb_single(path_fna, tmp_path):
 def test_parse_legacy_blastdir(anib_output_dir):
     """Parses directory of legacy BLAST output."""
     orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
-    fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles)
-    result = anib.process_blast(
-        anib_output_dir.legacyblastdir, orglengths, fraglengths, mode="ANIblastall"
+    fraglengths = aniblastall.get_fraglength_dict(anib_output_dir.fragfiles)
+    result = aniblastall.process_blast(
+        anib_output_dir.legacyblastdir, orglengths, fraglengths
     )
     assert_frame_equal(
         result.percentage_identity.sort_index(1).sort_index(),
@@ -257,10 +261,8 @@ def test_parse_legacy_blastdir(anib_output_dir):
 
 def test_parse_legacy_blasttab(anib_output):
     """Parses ANIB legacy .blast_tab output."""
-    fragdata = anib.get_fraglength_dict([anib_output.fragfile])
-    result = anib.parse_blast_tab(
-        anib_output.legacytabfile, fragdata, mode="ANIblastall"
-    )
+    fragdata = aniblastall.get_fraglength_dict([anib_output.fragfile])
+    result = aniblastall.parse_blast_tab(anib_output.legacytabfile, fragdata)
     assert (
         a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018])
     )
diff --git a/tests/test_concordance.py b/tests/test_concordance.py
index dcd0d583..3a9fa1b5 100644
--- a/tests/test_concordance.py
+++ b/tests/test_concordance.py
@@ -55,7 +55,7 @@
 import pytest
 
 from pyani import run_multiprocessing as run_mp
-from pyani import anib, anim, tetra, pyani_files
+from pyani import anib, aniblastall, anim, tetra, pyani_files
 
 
 def parse_jspecies(infile):
@@ -227,13 +227,13 @@ def test_anib_concordance(
         paths_concordance_fna, tmp_path, fragment_length
     )
     jobgraph = anib.make_job_graph(
-        paths_concordance_fna, fragfiles, anib.make_blastcmd_builder("ANIb", tmp_path)
+        paths_concordance_fna, fragfiles, anib.make_blastcmd_builder(tmp_path)
     )
     assert 0 == run_mp.run_dependency_graph(jobgraph)  # Jobs must run correctly
 
     # Process BLAST output
     result_pid = anib.process_blast(
-        tmp_path, orglengths, fraglengths, mode="ANIb"
+        tmp_path, orglengths, fraglengths
     ).percentage_identity
 
     # Compare JSpecies output to results. We do this in two blocks,
@@ -266,19 +266,19 @@ def test_aniblastall_concordance(
     orglengths = pyani_files.get_sequence_lengths(paths_concordance_fna)
 
     # Perform ANIblastall on the input directory contents
-    fragfiles, fraglengths = anib.fragment_fasta_files(
+    fragfiles, fraglengths = aniblastall.fragment_fasta_files(
         paths_concordance_fna, tmp_path, fragment_length
     )
-    jobgraph = anib.make_job_graph(
+    jobgraph = aniblastall.make_job_graph(
         paths_concordance_fna,
         fragfiles,
-        anib.make_blastcmd_builder("ANIblastall", tmp_path),
+        aniblastall.make_blastcmd_builder(tmp_path),
     )
     assert 0 == run_mp.run_dependency_graph(jobgraph)  # Jobs must run correctly
 
     # Process BLAST output
-    result_pid = anib.process_blast(
-        tmp_path, orglengths, fraglengths, mode="ANIblastall"
+    result_pid = aniblastall.process_blast(
+        tmp_path, orglengths, fraglengths
     ).percentage_identity
 
     # Compare JSpecies output to results
diff --git a/tests/test_multiprocessing.py b/tests/test_multiprocessing.py
index 3757cd3a..f7e1dab1 100644
--- a/tests/test_multiprocessing.py
+++ b/tests/test_multiprocessing.py
@@ -96,7 +96,7 @@ def test_cmdsets(mp_dummy_cmds):
 def test_dependency_graph_run(path_fna_two, fragment_length, tmp_path):
     """Test that module runs dependency graph."""
     fragresult = fragment_fasta_files(path_fna_two, tmp_path, fragment_length)
-    blastcmds = make_blastcmd_builder("ANIb", tmp_path)
+    blastcmds = make_blastcmd_builder(tmp_path)
     jobgraph = make_job_graph(path_fna_two, fragresult[0], blastcmds)
     result = run_dependency_graph(jobgraph)
     assert 0 == result

From 6de8b0fe96c4fb8bae3b802237b762a17c510f98 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 21 Sep 2021 12:54:12 +0100
Subject: [PATCH 27/75] Add tests for `get_version()`

These got removed somewhere?
---
 tests/test_anib.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/test_anib.py b/tests/test_anib.py
index 1710e84f..e929b658 100644
--- a/tests/test_anib.py
+++ b/tests/test_anib.py
@@ -121,6 +121,34 @@ def anib_output_dir(dir_anib_in):
     )
 
 
+# Test get_version()
+# Test case 1: there is no executable
+def test_get_version_no_exe(executable_missing, monkeypatch):
+    """Test behaviour when there is no file at the specified executable location."""
+    test_file_1 = Path("/non/existent/blastn")
+    assert anib.get_version(test_file_1) == f"No blastn executable at {test_file_1}"
+
+
+# Test case 2: there is a file, but it is not executable
+def test_get_version_exe_not_executable(executable_not_executable, monkeypatch):
+    """Test behaviour when the file at the executable location is not executable."""
+    test_file_2 = Path("/non/executable/blastn")
+    assert (
+        anib.get_version(test_file_2)
+        == f"blastn exists at {test_file_2} but not executable"
+    )
+
+
+# Test case 3: there is an executable file, but the version can't be retrieved
+def test_get_version_exe_no_version(executable_without_version, monkeypatch):
+    """Test behaviour when the version for the executable can not be retrieved."""
+    test_file_3 = Path("/missing/version/blastn")
+    assert (
+        anib.get_version(test_file_3)
+        == f"blastn exists at {test_file_3} but could not retrieve version"
+    )
+
+
 # Test BLAST+ (blastn) command generation
 def test_blastn_dbjobdict(path_fna_all, tmp_path):
     """Generate dictionary of BLASTN+ database jobs."""

From 7da06d938d0a1848a40b607f027b0db2edddb5f4 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 21:53:15 +0100
Subject: [PATCH 28/75] Fixed typos and put one variable name in camelcase

---
 pyani/scripts/subcommands/subcmd_anib.py | 53 ++++++++++++------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index abf1cda6..f31bb895 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -79,7 +79,7 @@
 # implementation
 class ComparisonJob(NamedTuple):
 
-    """Pairwise comparison job for the SQLAlchemy implementation"""
+    """Pairwise comparison job for the SQLAlchemy implementation."""
 
     query: str
     subject: str
@@ -219,7 +219,7 @@ def subcmd_anib(args: Namespace) -> None:
         "Compiling pairwise comparisons (this can take time for large datasets)..."
     )
     comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
-    logger.info(f"\t...total parwise comparisons to be performed: {len(comparisons)}")
+    logger.info(f"\t...total pairwise comparisons to be performed: {len(comparisons)}")
 
     # Check for existing comparisons; if one has already been done (for the same
     # software package, version, and setting) we add the comparison to this run,
@@ -255,17 +255,17 @@ def subcmd_anib(args: Namespace) -> None:
         logger.debug(
             "\tIn this mode, existing comparison output from %s is reused", args.outdir
         )
-        existingfiles = collect_existing_output(args.outdir, "blastn", args)
-        if existingfiles:
+        existing_files = collect_existing_output(args.outdir, "blastn", args)
+        if existing_files:
             logger.debug(
                 "\tIdentified %s existing output files for reuse, %s (etc)",
-                len(existingfiles),
-                existingfiles[0],
+                len(existing_files),
+                existing_files[0],
             )
         else:
             logger.debug("\tIdentified no existing output files")
     else:
-        existingfiles = list()
+        existing_files = list()
         logger.debug("\tAssuming no pre-existing output files")
 
     ## Split the input genome files into contiguous fragments of the specified size,
@@ -286,7 +286,7 @@ def subcmd_anib(args: Namespace) -> None:
     # This method considered, but doesn't get fraglens
     # fragfiles =  [file for file in fragdir.iterdir()]
     joblist = generate_joblist(
-        comparisons_to_run, existingfiles, fragfiles.values(), fraglens.values(), args
+        comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args
     )
     logger.debug("...created %s blastn jobs", len(joblist))
     # print(f"Joblist: {joblist}")
@@ -308,7 +308,7 @@ def subcmd_anib(args: Namespace) -> None:
 
 def generate_joblist(
     comparisons: List,
-    existingfiles: List,
+    existing_files: List,
     fragfiles: List,
     fragsizes: List,
     args: Namespace,
@@ -316,14 +316,14 @@ def generate_joblist(
     """Return list of ComparisonJobs.
 
     :param comparisons:  list of (Genome, Genome) tuples for which comparisons are needed
-    :param existingfiles:  list of pre-existing BLASTN+ outputs
+    :param existing_files:  list of pre-existing BLASTN+ outputs
     :param fragfiles:  list of files containing genome fragments
     :param fragsizes:  list of fragment lengths
     :param args:  Namespace, command-line arguments
     """
     logger = logging.getLogger(__name__)
 
-    existingfiles = set(existingfiles)  # Path objects hashable
+    existing_files = set(existing_files)  # Path objects hashable
 
     joblist = []  # will hold ComparisonJob structs
     for idx, (query, subject) in enumerate(
@@ -339,16 +339,17 @@ def generate_joblist(
         outfname = Path(outprefix + ".blast_tab")
         logger.debug("Expected output file for db: %s", outfname)
 
-        # If e're in recovery mode, we don't want to repeat a computational
-        # comparison that already exists, so we check whether the ultimate
-        # output is in the set of existing files and, if not, we add the obs
+        # If we are in recovery mode, we are salvaging output from a previous
+        # run, and do not necessarily need to rerun all the jobs. In this case,
+        # we prepare a list of output files we want to recover from the results
+        # in the output directory.
 
         # The comparisons collection always gets updated, so that results are
         # added to the database whether they come from recovery mode or are run
         # in this call of the script.
-        if args.recovery and outfname in existingfiles:
+        if args.recovery and outfname in existing_files:
             logger.debug("Recovering output from %s, not submitting job", outfname)
-            # Need to track the expected output, but set the job itself to None:
+            # Need to track the expected output, but set the job itself to None.
             joblist.append(
                 ComparisonJob(query, subject, blastcmd, outfname, args.fragsize, None)
             )
@@ -449,18 +450,18 @@ def update_comparison_results(
     args: Namespace,
 ) -> None:
     """Update the Comparision table with the completed result set.
-    #
-       :param joblist:         list of ComparisonJob namedtuples
-       :param run:             Run ORM object for the current ANIb run
-       :param session:         active pyanidb session via ORM
-       :param blastn_version:  version of blastn used for the comparison
-       :param fraglens:     dictionary of fragment lengths for each genome
-       :param args:            command-line arguments for this run
-    #
-       The Comparison table stores individual comparison results, one per row.
+
+    :param joblist:         list of ComparisonJob namedtuples
+    :param run:             Run ORM object for the current ANIb run
+    :param session:         active pyanidb session via ORM
+    :param blastn_version:  version of blastn used for the comparison
+    :param fraglens:     dictionary of fragment lengths for each genome
+    :param args:            command-line arguments for this run
+
+    The Comparison table stores individual comparison results, one per row.
     """
     logger = logging.getLogger(__name__)
-    #
+
     # Add individual results to Comparison table
     for job in tqdm(joblist, disable=args.disable_tqdm):
         logger.debug("\t%s vs %s", job.query.description, job.subject.description)

From fba3662c7384878c58e565e9212f96ffad022113 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 21:54:22 +0100
Subject: [PATCH 29/75] Fix which file is passed as input and which as blastdb

---
 pyani/scripts/subcommands/subcmd_anib.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index f31bb895..9a8418f8 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -201,7 +201,7 @@ def subcmd_anib(args: Namespace) -> None:
         fraglens.update({genome: fragsizes})
 
         dbcmd, blastdbpath = anib.construct_makeblastdb_cmd(
-            fragpath, blastdbdir
+            Path(genome.path), blastdbdir  # fragpath, blastdbdir
         )  # args.outdir)
 
         subprocess.run(
@@ -329,10 +329,14 @@ def generate_joblist(
     for idx, (query, subject) in enumerate(
         tqdm(comparisons, disable=args.disable_tqdm)
     ):
-        # fname1 = Path(query.path)
-        # fname2 = Path(subject.path)
+        qprefix, qsuffix = (
+            args.outdir / "fragments" / Path(query.path).stem,
+            Path(query.path).suffix,
+        )
+        qfrags = Path(f"{qprefix}-fragments{qsuffix}")
+
         blastcmd = anib.generate_blastn_commands(
-            Path(query.path), Path(subject.path), args.outdir, args.blastn_exe
+            qfrags, Path(subject.path), args.outdir, args.blastn_exe
         )
         logger.debug("Commands to run:\n\t%s\n", blastcmd)
         outprefix = blastcmd.split()[2][:-10]  # prefix for blastn output
@@ -472,8 +476,8 @@ def update_comparison_results(
             Comparison(
                 query=job.query,
                 subject=job.subject,
-                aln_length=aln_length,
-                sim_errs=sim_errs,
+                aln_length=int(aln_length),
+                sim_errs=int(sim_errs),
                 identity=ani_pid,
                 cov_query=qcov,
                 cov_subject=scov,

From 1fb3c0830b953b98ba2c4f1010847253cd93cbcd Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 21:55:23 +0100
Subject: [PATCH 30/75] Move serialisation of fragment dictionary to primary
 function

---
 pyani/scripts/subcommands/subcmd_anib.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 9a8418f8..3f76cae8 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -212,7 +212,9 @@ def subcmd_anib(args: Namespace) -> None:
             check=False,
         )
 
-        add_blastdb(session, genome, run, fragpath, blastdbpath, fragsizes, dbcmd)
+        add_blastdb(
+            session, genome, run, fragpath, blastdbpath, json.dumps(fragsizes), dbcmd
+        )
 
     # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples
     logger.info(
@@ -400,7 +402,7 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path
     # Write fragments to output file
     fragpath = outdir / f"{inpath.stem}-fragments.fna"
     SeqIO.write(outseqs, fragpath, "fasta")
-    return fragpath, json.dumps(sizedict)
+    return fragpath, sizedict
 
 
 def run_anib_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:

From 6c12a72c299cd2aa0ff9d012b8bef8392439ccae Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 21:56:07 +0100
Subject: [PATCH 31/75] Added useful debugging lines to `anib.py`

May be removed later on
---
 pyani/scripts/subcommands/subcmd_anib.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 3f76cae8..525e67e1 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -472,6 +472,8 @@ def update_comparison_results(
     for job in tqdm(joblist, disable=args.disable_tqdm):
         logger.debug("\t%s vs %s", job.query.description, job.subject.description)
         aln_length, sim_errs, ani_pid = anib.parse_blast_tab(job.outfile, fraglens)
+        logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}")
+        logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}")
         qcov = aln_length / job.query.length
         scov = aln_length / job.subject.length
         run.comparisons.append(

From 419db64265d5c98c0e897f0fc98cff04e1ad019a Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 21:56:56 +0100
Subject: [PATCH 32/75] Add long versions of CLI flags to `anib_parser.py`

---
 pyani/scripts/parsers/anib_parser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyani/scripts/parsers/anib_parser.py b/pyani/scripts/parsers/anib_parser.py
index 7473afc8..b2b99785 100644
--- a/pyani/scripts/parsers/anib_parser.py
+++ b/pyani/scripts/parsers/anib_parser.py
@@ -65,6 +65,7 @@ def build(
     )
     parser.add_argument(
         "-i",
+        "--indir",
         action="store",
         dest="indir",
         default=None,
@@ -73,6 +74,7 @@ def build(
     )
     parser.add_argument(
         "-o",
+        "--outdir",
         action="store",
         dest="outdir",
         default=None,

From fd41e4cbb0c5b79c05d3dbc5e9ce38cad59fe1df Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:00:32 +0100
Subject: [PATCH 33/75] Comment out (likely) unnecessary function

Will be removed later
---
 pyani/anib.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 4b5ec3e3..65b40c60 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -324,26 +324,26 @@ def make_job_graph(
     return joblist
 
 
-# Generate list of makeblastdb command lines from passed filenames
-def generate_blastdb_commands(
-    filenames: List[Path],
-    outdir: Path,
-    blastdb_exe: Optional[Path] = None,
-) -> List[Tuple[str, Path]]:
-    """Return list of makeblastdb command-lines for ANIb.
-
-    :param filenames:  a list of paths to input FASTA files
-    :param outdir:  path to output directory
-    :param blastdb_exe:  path to the makeblastdb executable
-    :param method:  str, ANIb analysis type (ANIb)
-    """
-    if blastdb_exe is None:
-        cmdlines = [construct_makeblastdb_cmd(fname, outdir) for fname in filenames]
-    else:
-        cmdlines = [
-            construct_makeblastdb_cmd(fname, outdir, blastdb_exe) for fname in filenames
-        ]
-    return cmdlines
+## Generate list of makeblastdb command lines from passed filenames
+# def generate_blastdb_commands(
+#     filenames: List[Path],
+#     outdir: Path,
+#     blastdb_exe: Optional[Path] = None,
+# ) -> List[Tuple[str, Path]]:
+#     """Return list of makeblastdb command-lines for ANIb.
+#
+#     :param filenames:  a list of paths to input FASTA files
+#     :param outdir:  path to output directory
+#     :param blastdb_exe:  path to the makeblastdb executable
+#     :param method:  str, ANIb analysis type (ANIb)
+#     """
+#     if blastdb_exe is None:
+#         cmdlines = [construct_makeblastdb_cmd(fname, outdir) for fname in filenames]
+#     else:
+#         cmdlines = [
+#             construct_makeblastdb_cmd(fname, outdir, blastdb_exe) for fname in filenames
+#         ]
+#     return cmdlines
 
 
 # Generate single makeblastdb command line

From 104178aa748f896e21815a6e9319673c5b7b4e14 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:00:58 +0100
Subject: [PATCH 34/75] Update docstrings to reflect new comparison handling

---
 pyani/anib.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 65b40c60..5683fd20 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -254,7 +254,6 @@ def make_blastcmd_builder(
 ) -> BLASTcmds:
     """Return BLASTcmds object for construction of BLAST commands.
 
-    :param method:  str, the kind of ANI analysis (ANIb)
     :param outdir:
     :param format_exe:
     :param blast_exe:
@@ -400,8 +399,8 @@ def construct_blastn_cmdline(
 ) -> str:
     """Return a single blastn command.
 
-    :param fname1:  Path, FASTA file for query genome
-    :param fname2:  Path, database of fragments for subject genome
+    :param query:  Path, FASTA file for query genome
+    :param subj_db:  Path, database of fragments for subject genome
     :param outdir:  Path, to the output directory
     :param blastn_exe:  str, path to blastn executable
     """

From 37477db937903cee676b451bf033b20c89f3f553 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:01:16 +0100
Subject: [PATCH 35/75] Add useful debugging lines

May be removed later
---
 pyani/anib.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyani/anib.py b/pyani/anib.py
index 5683fd20..43101ebc 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -83,6 +83,7 @@
 
 import os
 import platform
+import logging
 import re
 import shutil
 import subprocess
@@ -509,6 +510,8 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     over an alignable region of at least 70% of their length.
     '''
     """
+    logger = logging.getLogger(__name__)
+
     # Load output as dataframe
     columns = [
         "sbjct_id",
@@ -535,6 +538,8 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
         data.columns = columns
     except pd.io.common.EmptyDataError:
         data = pd.DataFrame(columns=columns)
+    # logger.debug(f"Index: {[idx for idx in data.index]}")
+    logger.debug(f"data: {data.head()}")
     # Add new columns for recalculated alignment length, proportion, and
     # percentage identity
     data["ani_alnlen"] = data["blast_alnlen"] - data["blast_gaps"]

From 8cf205b0881a9ceca7d1de0e2439b61a8be31dd3 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:01:34 +0100
Subject: [PATCH 36/75] Populate `aniblastall_parser.py`

---
 pyani/scripts/parsers/aniblastall_parser.py | 62 +++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/pyani/scripts/parsers/aniblastall_parser.py b/pyani/scripts/parsers/aniblastall_parser.py
index 10283210..364769ec 100644
--- a/pyani/scripts/parsers/aniblastall_parser.py
+++ b/pyani/scripts/parsers/aniblastall_parser.py
@@ -39,10 +39,14 @@
 """Provides parser for aniblastall subcommand."""
 
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, _SubParsersAction
+from pathlib import Path
 from typing import List, Optional
 
+from pyani import pyani_config
 from pyani.scripts import subcommands
 
+from pyani import __version__
+
 
 def build(
     subps: _SubParsersAction, parents: Optional[List[ArgumentParser]] = None
@@ -51,8 +55,66 @@ def build(
 
     :param subps:  collection of subparsers in main parser
     :param parents:  parsers from which arguments are inherited
+
+    The terminology may be confusing, but in practice the main parser collects
+    command-line arguments that are then available to this parser, which inherits
+    options from the parsers in `parents` in addition to those defined below.
     """
     parser = subps.add_parser(
         "aniblastall", parents=parents, formatter_class=ArgumentDefaultsHelpFormatter
     )
+    # Required arguments
+    parser.add_argument(
+        "-i",
+        "--indir",
+        action="store",
+        dest="indir",
+        default=None,
+        required=True,
+        type=Path,
+        help="input genome directory (required)",
+    )
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        action="store",
+        dest="outdir",
+        default=None,
+        required=True,
+        type=Path,
+        help="output analysis results directory (required)",
+    )
+    # Optional arguments
+    parser.add_argument(
+        "--dbpath",
+        action="store",
+        dest="dbpath",
+        default=Path(".pyani/pyanidb"),
+        type=Path,
+        help="path to pyani database",
+    )
+    parser.add_argument(
+        "--blastall_exe",
+        action="store",
+        dest="blastall_exe",
+        default=pyani_config.BLASTALL_DEFAULT,
+        type=Path,
+        help="path to blastall executable",
+    )
+    parser.add_argument(
+        "--format_exe",
+        dest="format_exe",
+        action="store",
+        default=pyani_config.FORMATDB_DEFAULT,
+        type=Path,
+        help="path to formatdb executable",
+    )
+    parser.add_argument(
+        "--fragsize",
+        dest="fragsize",
+        action="store",
+        type=int,
+        default=pyani_config.FRAGSIZE,
+        help="blastn query fragment size",
+    )
     parser.set_defaults(func=subcommands.subcmd_aniblastall)

From afd1be15a48254d518fca7657f1043f495c59de5 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:15:34 +0100
Subject: [PATCH 37/75] Comment out (likely) unnecessary function

Will be removed later
---
 pyani/aniblastall.py | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index 24c9f901..ed2b9cdb 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -293,27 +293,26 @@ def make_job_graph(
     return joblist
 
 
-# Generate list of makeblastdb command lines from passed filenames
-def generate_blastdb_commands(
-    filenames: List[Path],
-    outdir: Path,
-    blastdb_exe: Optional[Path] = None,
-    # mode: "ANIblastall"
-) -> List[Tuple[str, Path]]:
-    """Return list of makeblastdb command-lines for ANIblastall.
-
-    :param filenames:  a list of paths to input FASTA files
-    :param outdir:  path to output directory
-    :param blastdb_exe:  path to the makeblastdb executable
-    :param method:  str, ANI analysis type (ANIblastall)
-    """
-    if blastdb_exe is None:
-        cmdlines = [construct_formatdb_cmd(fname, outdir) for fname in filenames]
-    else:
-        cmdlines = [
-            construct_formatdb_cmd(fname, outdir, blastdb_exe) for fname in filenames
-        ]
-    return cmdlines
+## Generate list of makeblastdb command lines from passed filenames
+# def generate_blastdb_commands(
+#     filenames: List[Path],
+#     outdir: Path,
+#     blastdb_exe: Optional[Path] = None,
+# ) -> List[Tuple[str, Path]]:
+#     """Return list of makeblastdb command-lines for ANIblastall.
+#
+#     :param filenames:  a list of paths to input FASTA files
+#     :param outdir:  path to output directory
+#     :param blastdb_exe:  path to the makeblastdb executable
+#     :param method:  str, ANI analysis type (ANIblastall)
+#     """
+#     if blastdb_exe is None:
+#         cmdlines = [construct_formatdb_cmd(fname, outdir) for fname in filenames]
+#     else:
+#         cmdlines = [
+#             construct_formatdb_cmd(fname, outdir, blastdb_exe) for fname in filenames
+#         ]
+#     return cmdlines
 
 
 # Generate single makeblastdb command line

From 7dc1113deaaca649ef27ace1d40b2ceaf5f71f03 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:22:53 +0100
Subject: [PATCH 38/75] Simplify generation of command lines and update
 supporting docstrings

---
 pyani/aniblastall.py | 47 +++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index ed2b9cdb..cd5cdd06 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -331,14 +331,17 @@ def construct_formatdb_cmd(
 
 
 # Generate list of BLASTN command lines from passed filenames
-def generate_blastn_commands(
-    filenames: List[Path],
+def generate_blastall_commands(
+    # filenames: List[Path],
+    query: Path,
+    subject: Path,
     outdir: Path,
-    blast_exe: Optional[Path] = None,
+    blastall_exe: Optional[Path] = None,
 ) -> List[str]:
     """Return a list of blastn command-lines for ANIblastall.
 
-    :param filenames:  a list of paths to fragmented input FASTA files
+    :param query:  a paths to the query's fragmented input FASTA file
+    :param subject:  a paths to the subject's fragmented input FASTA file
     :param outdir:  path to output directory
     :param blastn_exe:  path to BLASTN executable
     :param method:  str, analysis type (ANIblastall)
@@ -348,41 +351,31 @@ def generate_blastn_commands(
     have the form ACCESSION.ext. This is the convention followed by the
     fragment_FASTA_files() function above.
     """
-    cmdlines = []
-    for idx, fname1 in enumerate(filenames[:-1]):
-        dbname1 = Path(str(fname1).replace("-fragments", ""))
-        for fname2 in filenames[idx + 1 :]:
-            dbname2 = Path(str(fname2).replace("-fragments", ""))
-            if blast_exe is None:
-                cmdlines.append(construct_blastall_cmdline(fname1, dbname2, outdir))
-                cmdlines.append(construct_blastall_cmdline(fname2, dbname1, outdir))
-            else:
-                cmdlines.append(
-                    construct_blastall_cmdline(fname1, dbname2, outdir, blast_exe)
-                )
-                cmdlines.append(
-                    construct_blastall_cmdline(fname2, dbname1, outdir, blast_exe)
-                )
-    return cmdlines
+    subj_db = outdir / "blastalldbs" / Path(str(subject.name))
+    if blastall_exe is None:
+        cmdline = construct_blastall_cmdline(query, subj_db, outdir)
+    else:
+        cmdline = construct_blastall_cmdline(query, subj_db, outdir, blastall_exe)
+        return cmdline
 
 
 # Generate single BLASTALL command line
 def construct_blastall_cmdline(
-    fname1: Path,
-    fname2: Path,
+    query: Path,
+    subj_db: Path,
     outdir: Path,
     blastall_exe: Path = pyani_config.BLASTALL_DEFAULT,
 ) -> str:
     """Return single blastall command.
 
-    :param fname1:
-    :param fname2:
-    :param outdir:
     :param blastall_exe:  str, path to BLASTALL executable
+    :param query:  Path, FASTA file for query genome
+    :param subj_db:  Path, database of fragments for subject genome
+    :param outdir:  Path, to the output directory
     """
-    prefix = outdir / f"{fname1.stem.replace('-fragments', '')}_vs_{fname2.stem}"
+    prefix = Path(outdir) / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}"
     return (
-        f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {fname1} -d {fname2} "
+        f"{blastall_exe} -p blastn -o {prefix}.blast_tab -i {query} -d {subj_db} "
         "-X 150 -q -1 -F F -e 1e-15 -b 1 -v 1 -m 8"
     )
 

From ee90a68a23e40346e612067389c5c0753a43aeef Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:23:29 +0100
Subject: [PATCH 39/75] Update command name referenced in docstrings

---
 pyani/aniblastall.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index cd5cdd06..33eac9d7 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -223,7 +223,6 @@ def make_blastcmd_builder(
 ) -> BLASTcmds:
     """Return BLASTcmds object for construction of BLAST commands.
 
-    :param method:  str, the kind of ANI analysis (ANIblastall)
     :param outdir:
     :param format_exe:
     :param blast_exe:
@@ -343,7 +342,7 @@ def generate_blastall_commands(
     :param query:  a paths to the query's fragmented input FASTA file
     :param subject:  a paths to the subject's fragmented input FASTA file
     :param outdir:  path to output directory
-    :param blastn_exe:  path to BLASTN executable
+    :param blastall_exe:  path to BLASTALL executable
     :param method:  str, analysis type (ANIblastall)
 
     Assumes that the fragment sequence input filenames have the form
@@ -368,10 +367,10 @@ def construct_blastall_cmdline(
 ) -> str:
     """Return single blastall command.
 
-    :param blastall_exe:  str, path to BLASTALL executable
     :param query:  Path, FASTA file for query genome
     :param subj_db:  Path, database of fragments for subject genome
     :param outdir:  Path, to the output directory
+    :param blastall_exe:  str, path to blastall executable
     """
     prefix = Path(outdir) / f"{query.stem.replace('-fragments', '')}_vs_{subj_db.stem}"
     return (

From e8f6e417d1ac18a489fd1b744bafc3ff2775ea9e Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:24:11 +0100
Subject: [PATCH 40/75] Fix which files are passed as input and blastdb in
 command generation

---
 pyani/aniblastall.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index 33eac9d7..03f50299 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -324,7 +324,9 @@ def construct_formatdb_cmd(
     :param outdir:  Path, path to output directory
     :param blastdb_exe:  Path, path to the formatdb executable
     """
-    newfilename = Path(outdir) / Path(filename.name)
+    newfilename = (
+        Path(outdir) / Path(filename).name
+    )  # Path(filename.name.replace("-fragments", ""))
     shutil.copy(filename, newfilename)
     return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename)
 

From c0c518aaccd6a3547380021ed689cc5413453013 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:24:46 +0100
Subject: [PATCH 41/75] Add useful debugging lines

May be removed later
---
 pyani/aniblastall.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index 03f50299..6e3f4899 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -324,10 +324,12 @@ def construct_formatdb_cmd(
     :param outdir:  Path, path to output directory
     :param blastdb_exe:  Path, path to the formatdb executable
     """
+    logger = logging.getLogger(__name__)
     newfilename = (
         Path(outdir) / Path(filename).name
     )  # Path(filename.name.replace("-fragments", ""))
     shutil.copy(filename, newfilename)
+    logger.debug(f"mv {filename} {newfilename}")
     return (f"{blastdb_exe} -p F -i {newfilename} -t {filename.stem}", newfilename)
 
 
@@ -401,10 +403,17 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     over an alignable region of at least 70% of their length.
     '''
     """
+    logger = logging.getLogger(__name__)
+
     # Assuming that the filename format holds org1_vs_org2.blast_tab:
+    logger.debug(f"Filename: {filename}")
     qname = filename.stem.split("_vs_")[0]
+    logger.debug(f"Qname: {qname}")
     # Load output as dataframe
+    logger.debug(f"Keys: {fraglengths.keys()}")
     qfraglengths = fraglengths[qname]
+    # logger.debug(f"Qkeys: {qfraglengths.keys()}")
+    logger.debug(f"Qfraglengths: {type(qfraglengths)}")
     columns = [
         "sid",
         "blast_pid",
@@ -422,12 +431,14 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     # regions of homology. This causes pandas to throw an error on CSV import.
     # To get past this, we create an empty dataframe with the appropriate
     # columns.
+    # logger.debug(pd.read_csv(filename, header=None, index_col=0))
     try:
         data = pd.read_csv(filename, header=None, sep="\t", index_col=0)
         data.columns = columns
     except pd.io.common.EmptyDataError:
         data = pd.DataFrame(columns=columns)
     # Add new column for fragment length, only for BLASTALL
+    logger.debug(f"Data: {data.head()}")
     data["qlen"] = pd.Series(
         [qfraglengths[idx] for idx in data.index], index=data.index
     )
@@ -437,6 +448,8 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     data["ani_alnids"] = data["ani_alnlen"] - data["blast_mismatch"]
     data["ani_coverage"] = data["ani_alnlen"] / data["qlen"]
     data["ani_pid"] = data["ani_alnids"] / data["qlen"]
+    logger.debug(f"data: {data.head()}")
+
     # Filter rows on 'ani_coverage' > 0.7, 'ani_pid' > 0.3
     filtered = data[(data["ani_coverage"] > 0.7) & (data["ani_pid"] > 0.3)]
     # Dedupe query hits, so we only take the best hit
@@ -457,8 +470,10 @@ def parse_blast_tab(filename: Path, fraglengths: Dict) -> Tuple[int, int, int]:
     if pd.isnull(ani_pid):  # Happens if there are no matches in ANIb
         ani_pid = 0
     aln_length = filtered["ani_alnlen"].sum()
+    logger.debug(f"Alignment length, aniblastall.py: {aln_length.sum()}")
     sim_errors = filtered["blast_mismatch"].sum() + filtered["blast_gaps"].sum()
     filtered.to_csv(Path(filename).with_suffix(".blast_tab.dataframe"), sep="\t")
+    logger.debug(f"Results, aniblastall.py: {aln_length}, {sim_errors}, {ani_pid}")
     return aln_length, sim_errors, ani_pid
 
 
@@ -485,6 +500,7 @@ def process_blast(
     May throw a ZeroDivisionError if one or more BLAST runs failed, or a
     very distant sequence was included in the analysis.
     """
+    logger = logging.getLogger(__name__)
     # Process directory to identify input files
     blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab")
     # Hold data in ANIResults object
@@ -518,6 +534,7 @@ def process_blast(
                 )
             continue
         resultvals = parse_blast_tab(blastfile, fraglengths)
+        logger.debug(f"Resultvals: {resultvals}")
         query_cover = float(resultvals[0]) / org_lengths[qname]
 
         # Populate dataframes: when assigning data, we need to note that

From dd1ee44b66ae65b821b89842994b414ff6b0b1a3 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:33:12 +0100
Subject: [PATCH 42/75] Add imports and convenience struct for `ComparisonJob`

---
 .../scripts/subcommands/subcmd_aniblastall.py | 49 ++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index f91208cd..077860db 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -39,11 +39,58 @@
 # THE SOFTWARE.
 """Provides the aniblastall subcommand for pyani."""
 
+import datetime
+import json
+import logging
+import os
+import subprocess
+
 from argparse import Namespace
 from logging import Logger
+from itertools import permutations
+from pathlib import Path
+from typing import List, NamedTuple, Tuple, Dict
+
+from Bio import SeqIO
+from tqdm import tqdm
+
+from pyani import (
+    PyaniException,
+    aniblastall,
+    pyani_config,
+    pyani_jobs,
+    run_sge,
+    run_multiprocessing as run_mp,
+)
+from pyani.pyani_files import collect_existing_output
+from pyani.pyani_orm import (
+    add_run,
+    add_run_genomes,
+    add_blastdb,
+    Comparison,
+    filter_existing_comparisons,
+    get_session,
+    PyaniORMException,
+    update_comparison_matrices,
+)
+from pyani.pyani_tools import termcolor
+
+
+# Convenience struct describing a pairwise comparison job for the SQLAlchemy
+# implementation
+class ComparisonJob(NamedTuple):
+
+    """Pairwise comparison job for the SQLAlchemy implementation."""
+
+    query: str
+    subject: str
+    blastcmd: str
+    outfile: Path
+    fragsize: int
+    job: pyani_jobs.Job
 
 
-def subcmd_aniblastall(args: Namespace):
+def subcmd_aniblastall(args: Namespace) -> None:
     """Perform ANIblastall on all genome files in an input directory.
 
     :param args:

From 8cfbfe8b16d4385ca61c2374c7b80428db3fdc24 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:35:19 +0100
Subject: [PATCH 43/75] Add initial code for starting a run and adding it to
 the database

---
 .../scripts/subcommands/subcmd_aniblastall.py | 51 ++++++++++++++++++-
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 077860db..bcdffd53 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -93,7 +93,54 @@ class ComparisonJob(NamedTuple):
 def subcmd_aniblastall(args: Namespace) -> None:
     """Perform ANIblastall on all genome files in an input directory.
 
-    :param args:
+    :param args:  Namespace, command-line arguments
     :param logger:
+
+    Finds ANI by the ANIblastall method, as described in ... some paper.
+
+    All FASTA format files (selected by suffix) in the input directory are fragmented into (by default 1020nt) consecutive sections, and a BLASTALL database constructed from the whole genome input. The BLAST+ blastall tool is then used to query each set of fragments against each BLAST+ database, in turn.
+
+    For each query, the BLAST+ .tab output is parsed to obtain alignment length, identity and similarity error count. Alignments below a threshold are not included in the calculation (this introduces systematic bias with respect to ANIm). The results are processed to calculate the ANI percentages, coverage, and similarity error.
+
+    The calculated values are stored in the local SQLite3 database.
     """
-    raise NotImplementedError
+    # Create logger
+    logger = logging.getLogger(__name__)
+
+    # Announce the analysis
+    logger.info(termcolor("Running ANIblastall analysis", "red"))
+
+    # Get BLASTALL version - this will be used in the database entreis
+    blastall_version = aniblastall.get_version(args.blastall_exe)
+    logger.info(termcolor(f"BLAST+ blastall version: {blastall_version}", "cyan"))
+
+    # Use provided name, or make new one for this analysis
+    start_time = datetime.datetime.now()
+    name = args.name or "_".join(["ANIblastall", start_time.isoformat()])
+    logger.info(termcolor(f"Analysis name: {name}", "cyan"))
+
+    # Connect to existing database (which may be "clean" or have old analyses)
+    logger.debug(f"Connecting to database {args.dbpath}")
+    try:
+        session = get_session(args.dbpath)
+    except Exception:
+        logger.error(
+            f"Could not connect to database {args.dbpath} (exiting)", exc_info=True
+        )
+        raise SystemExit(1)
+
+    # Add information about this run to the database
+    logger.debug(f"Adding run info to database {args.dbpath}...")
+    try:
+        run = add_run(
+            session,
+            method="ANIblastall",
+            cmdline=args.cmdline,
+            date=start_time,
+            status="started",
+            name=name,
+        )
+    except PyaniORMException:
+        logger.error("Could not add run to the database (exiting)", exc_info=True)
+        raise SystemExit(1)
+    logger.debug(f"\t...added run ID: {run} to the database")

From 2aa686b5140cb96debcd55fabd8b3319fb0c77c6 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:36:03 +0100
Subject: [PATCH 44/75] Add genomes for the ru  to the database

---
 pyani/scripts/subcommands/subcmd_aniblastall.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index bcdffd53..ac186a51 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -144,3 +144,16 @@ def subcmd_aniblastall(args: Namespace) -> None:
         logger.error("Could not add run to the database (exiting)", exc_info=True)
         raise SystemExit(1)
     logger.debug(f"\t...added run ID: {run} to the database")
+
+    # Identify input files for comparison, and populate the database
+    logger.debug(f"Adding files for {run} to database...")
+    try:
+        genome_ids = add_run_genomes(
+            session, run, args.indir, args.classes, args.labels
+        )
+    except PyaniORMException:
+        logger.error(
+            f"Could not add genomes to database for run {run} (exiting)", exc_info=True
+        )
+        raise SystemExit(1)
+    logger.debug(f"\t...added gnome IDs: {genome_ids}")

From 10bd2a27296422bd6fd9626969fe0a82c580baf2 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:43:16 +0100
Subject: [PATCH 45/75] Get genomes for the run and create output directories

---
 .../scripts/subcommands/subcmd_aniblastall.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index ac186a51..53448458 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -157,3 +157,24 @@ def subcmd_aniblastall(args: Namespace) -> None:
         )
         raise SystemExit(1)
     logger.debug(f"\t...added gnome IDs: {genome_ids}")
+
+    # Get list of genomes for this anlaysis from the database
+    logger.info("Compiling genomes for comparison")
+    genomes = run.genomes.all()
+    logger.debug(f"\tCollected {len(genomes)} genomes for this run")
+
+    # Create output directories. We create the amin parent directory (args.outdir), but
+    # also subdirectories for the BLAST databases.
+    logger.debug(f"Creating output directory {args.outdir}")
+    try:
+        os.makedirs(args.outdir, exist_ok=True)
+    except IOError:
+        logger.error(
+            f"Could not create output directory {args.outdir} (exiting)", exc_info=True
+        )
+        raise SystemError(1)
+    fragdir = Path(str(args.outdir)) / "fragments"
+    blastdbdir = Path(str(args.outdir)) / "blastalldbs"
+    logger.debug("\t...creating subdirectories")
+    os.makedirs(fragdir, exist_ok=True)
+    os.makedirs(blastdbdir, exist_ok=True)

From 8df94c7be8a965d5f4211faa3b574f108c2193ec Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:49:34 +0100
Subject: [PATCH 46/75] Add code to fragment fasta files

---
 .../scripts/subcommands/subcmd_aniblastall.py | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 53448458..57505291 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -178,3 +178,49 @@ def subcmd_aniblastall(args: Namespace) -> None:
     logger.debug("\t...creating subdirectories")
     os.makedirs(fragdir, exist_ok=True)
     os.makedirs(blastdbdir, exist_ok=True)
+
+    # Create a new sequence fragment file and a new BLAST+ database for each input genome,
+    # and add this data to the database as a row in BlastDB
+    logger.info("Creating input sequence fragment files")
+    fragfiles = {}
+    fraglens = {}
+    for genome in genomes:
+        fragpath, fragsizes = fragment_fasta_file(
+            Path(str(genome.path)), Path(str(fragdir)), args.fragsize
+        )
+        logger.info(f"fragsizes: {type(fragsizes)}")
+        fragfiles.update({Path(genome.path).stem: fragpath})
+        fraglens.update({Path(genome.path).stem: fragsizes})
+
+
+def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:
+    """Return path to fragmented sequence file and JSON of fragment lengths.
+
+    :param inpath:  Path to genome file
+    :param outdir:  Path to directory to hold fragmented files
+    :param fragsize:  size of genome fragments
+
+    Returns a tuple of ``(path, json)`` where ``path`` is the path to the fragment
+    file and ``json`` is a JSON-ified dictionary of fragment lengths, keyed by
+    fragment sequence ID.
+    """
+    # Generate fragments for the input sequence, looping over each contig/
+    # chromosome in the input file and breaking into sections of length
+    # fragsize
+    sizedict = {}
+    outseqs = []
+    count = 0
+    for seq in SeqIO.parse(inpath, "fasta"):
+        idx = 0
+        while idx < len(seq):
+            count += 1
+            newseq = seq[idx : idx + fragsize]
+            newseq.id = f"frag{count:05d}"
+            outseqs.append(newseq)
+            sizedict[newseq.id] = len(newseq)
+            idx += fragsize
+
+    # Write fragments to output file
+    fragpath = outdir / f"{inpath.stem}-fragments.fna"
+    SeqIO.write(outseqs, fragpath, "fasta")
+    return fragpath, sizedict

From e42c0890eb281ee05fbbc236f0c344fb2936f79e Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:50:09 +0100
Subject: [PATCH 47/75] Add code to create `blastdb`

---
 .../scripts/subcommands/subcmd_aniblastall.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 57505291..7d06bae9 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -192,6 +192,25 @@ def subcmd_aniblastall(args: Namespace) -> None:
         fragfiles.update({Path(genome.path).stem: fragpath})
         fraglens.update({Path(genome.path).stem: fragsizes})
 
+        logger.info("Constructing formatdb command")
+        dbcmd, blastdbpath = aniblastall.construct_formatdb_cmd(
+            Path(genome.path), blastdbdir
+        )
+
+        logger.info("Running subprocess")
+        subprocess.run(
+            dbcmd,
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+        )
+
+        logger.info("Adding blastdb")
+        add_blastdb(
+            session, genome, run, fragpath, blastdbpath, json.dumps(fragsizes), dbcmd
+        )
+
 
 def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:
     """Return path to fragmented sequence file and JSON of fragment lengths.

From 05665ee8b56263f7882190f72ead28ad7ee5e06e Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:51:13 +0100
Subject: [PATCH 48/75] Add code to generate list of comparisons and filter
 existing ones

---
 .../scripts/subcommands/subcmd_aniblastall.py  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 7d06bae9..e3251c5a 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -211,6 +211,24 @@ def subcmd_aniblastall(args: Namespace) -> None:
             session, genome, run, fragpath, blastdbpath, json.dumps(fragsizes), dbcmd
         )
 
+    # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples
+    logger.info(
+        "Compiling pairwise comparisons (this can take time for large datasets)..."
+    )
+    comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
+    logger.info(f"\t...total pairwise comparisons to be performed: {len(comparisons)}")
+
+    # Check for existing comparisons; if one has already been done (for the same
+    # software package, version, and setting) we add the comparison to this run,
+    # but remove it from the list of comparisons to be performed
+    logger.info("Checking database for existing comparison data...")
+    comparisons_to_run = filter_existing_comparisons(
+        session, run, comparisons, "blastall", blastall_version, args.fragsize, False
+    )
+    logger.info(
+        f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"
+    )
+
 
 def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:
     """Return path to fragmented sequence file and JSON of fragment lengths.

From 502becf8bfc4520eb5ca83c7ff7bf0bee96539ef Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:51:48 +0100
Subject: [PATCH 49/75] Add code for case where all comparisons have been run
 already

---
 pyani/scripts/subcommands/subcmd_aniblastall.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index e3251c5a..5b2da3a5 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -229,6 +229,19 @@ def subcmd_aniblastall(args: Namespace) -> None:
         f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"
     )
 
+    # If there are no comparisons to run, update the Run matrices and exit
+    # from this function
+    if not comparisons_to_run:
+        logger.info(
+            termcolor(
+                "All comparison results present in database (skipping comparisons)",
+                "magenta",
+            )
+        )
+        logger.info("Updating summary matrices with existing results")
+        update_comparison_matrices(session, run)
+        return
+
 
 def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:
     """Return path to fragmented sequence file and JSON of fragment lengths.

From 3589efc509087ca2732c27165bc0a6dff1f8cb76 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 22:53:22 +0100
Subject: [PATCH 50/75] Add code for recovery mode

---
 .../scripts/subcommands/subcmd_aniblastall.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 5b2da3a5..76023372 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -242,6 +242,27 @@ def subcmd_aniblastall(args: Namespace) -> None:
         update_comparison_matrices(session, run)
         return
 
+    # If we are in recovery mode, we are salvaging output from a previous
+    # run, and do not necessarily need to rerun all the jobs. In this case,
+    # we prepare a list of output files we want to recover from the results
+    # in the output directory.
+    # ¶ Should this use output files, or pull from the database?
+    if args.recovery:
+        logger.warning("Entering recovery mode...")
+        logger.debug(
+            f"\tIn this mode, existing comparison output from {args.outdir} is reused"
+        )
+        existing_files = collect_existing_output(args.outdir, "blastall", args)
+        if existing_files:
+            logger.debug(
+                f"\tIdentified {len(existing_files)} existing output files for reuse, existing_files[0] (et cetera)"
+            )
+        else:
+            logger.debug("\tIdentified no existing output files")
+    else:
+        existing_files = list()
+        logger.debug("\tAssuming no pre-existing output files")
+
 
 def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:
     """Return path to fragmented sequence file and JSON of fragment lengths.

From c74bf3a11ada66020ea2938ee96dfb7bf6103a92 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:03:40 +0100
Subject: [PATCH 51/75] Rename `blastcmd` to `blastallcmd`

---
 pyani/scripts/subcommands/subcmd_aniblastall.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 76023372..95e65dbc 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -84,7 +84,7 @@ class ComparisonJob(NamedTuple):
 
     query: str
     subject: str
-    blastcmd: str
+    blastallcmd: str
     outfile: Path
     fragsize: int
     job: pyani_jobs.Job

From 953be895f3ac790d71a34b9c9307996ee46e6553 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:07:44 +0100
Subject: [PATCH 52/75] Add `generate_joblist()`

---
 .../scripts/subcommands/subcmd_aniblastall.py | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 95e65dbc..37b35503 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -264,6 +264,73 @@ def subcmd_aniblastall(args: Namespace) -> None:
         logger.debug("\tAssuming no pre-existing output files")
 
 
+def generate_joblist(
+    comparisons: List,
+    existing_files: List,
+    fragfiles: List,
+    fragsizes: List,
+    args: Namespace,
+) -> List[ComparisonJob]:
+    """Return list of ComparisonJobs.
+
+    :param comparisons:  list of (Genome, Genome) tuples for which comparisons are needed
+    :param existing_files:  list of pre-existing BLASTALL outputs
+    :param fragfiles:  list of files containing genome fragments
+    :param fragsizes:  list of fragment lengths
+    :param args:  Namespace, command-line arguments
+    """
+    logger = logging.getLogger(__name__)
+
+    existing_files = set(existing_files)  # Path objects hashable
+
+    joblist = []  # will hold ComparisonJob structs
+    for idx, (query, subject) in enumerate(
+        tqdm(comparisons, disable=args.disable_tqdm)
+    ):
+        qprefix, qsuffix = (
+            args.outdir / "fragments" / Path(query.path).stem,
+            Path(query.path).suffix,
+        )
+        qfrags = Path(f"{qprefix}-fragments{qsuffix}")
+
+        blastallcmd = aniblastall.generate_blastall_commands(
+            qfrags, Path(subject.path), args.outdir, args.blastall_exe
+        )
+        logger.debug(f"Commands to run:\n\t{blastallcmd}\n")
+        outprefix = blastallcmd.split()[4][:-10]  # prefix for blastall output
+        outfname = Path(outprefix + ".blast_tab")
+        logger.debug(f"Expected output file for db: {outfname}")
+
+        # If we are in recovery mode, we are salvaging output from a previous
+        # run, and do not necessarily need to rerun all the jobs. In this case,
+        # we prepare a list of output files we want to recover from the results
+        # in the output directory.
+
+        # The comparisons collection always gets updated, so that results are
+        # added to the database whether they come from recovery mode or are run
+        # in this call of the script.
+        if args.recovery and outfname in existing_files:
+            logger.debug(f"Recovering output from {outfname}, not submitting job")
+            # Need to track the expected output, but set the job itself to None.
+            joblist.append(
+                ComparisonJob(
+                    query, subject, blastallcmd, outfname, args.fragsize, None
+                )
+            )
+        else:
+            logger.debug("Building job")
+            # Build job
+            blastalljob = pyani_jobs.Job(
+                "{args.jobprefix}_{idx:06d}-blastall", blastallcmd
+            )
+            joblist.append(
+                ComparisonJob(
+                    query, subject, blastallcmd, outfname, args.fragsize, blastalljob
+                )
+            )
+    return joblist
+
+
 def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path, str]:
     """Return path to fragmented sequence file and JSON of fragment lengths.
 

From 1971d25bb9114c722dd64dda881b070dfbcb3d74 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:08:07 +0100
Subject: [PATCH 53/75] Add code to generate `joblist`

---
 pyani/scripts/subcommands/subcmd_aniblastall.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 37b35503..0c3bacd1 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -263,6 +263,15 @@ def subcmd_aniblastall(args: Namespace) -> None:
         existing_files = list()
         logger.debug("\tAssuming no pre-existing output files")
 
+    # Create list of BLASTALL jobs for each comparison still to be performed
+    logger.info("Creating blastall jobs for ANIblastall...")
+    # This method considered, but doesn't get fraglens
+    # fragfiles =  [file for file in fragdir.iterdir()]
+    joblist = generate_joblist(
+        comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args
+    )
+    logger.debug(f"...created {len(joblist)} blastall jobs")
+
 
 def generate_joblist(
     comparisons: List,

From 641a843ccd2662ca3c172ba5fcf825d9cdfc40d8 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:09:20 +0100
Subject: [PATCH 54/75] Add code to run comparisons

---
 .../scripts/subcommands/subcmd_aniblastall.py | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 0c3bacd1..e312afc0 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -272,6 +272,11 @@ def subcmd_aniblastall(args: Namespace) -> None:
     )
     logger.debug(f"...created {len(joblist)} blastall jobs")
 
+    # Pass jobs to appropriate scheduler
+    logger.debug(f"Passing {len(joblist)} jobs to {args.scheduler}")
+    run_aniblastall_jobs(joblist, args)
+    logger.info("...jobs complete.")
+
 
 def generate_joblist(
     comparisons: List,
@@ -371,3 +376,43 @@ def fragment_fasta_file(inpath: Path, outdir: Path, fragsize: int) -> Tuple[Path
     fragpath = outdir / f"{inpath.stem}-fragments.fna"
     SeqIO.write(outseqs, fragpath, "fasta")
     return fragpath, sizedict
+
+
+def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
+    """Pass ANIblastall blastall jobs to the scheduler.
+
+    :param joblist:  list of ComparisonJob namedtuples
+    :param args:     command-line arguments for the run
+    """
+    logger = logging.getLogger(__name__)
+    logger.debug(f"Scheduler: {args.scheduler}")
+
+    # Entries with None seen in recovery mode:
+    jobs = [_.job for _ in joblist if _.job]
+
+    if args.scheduler == "multiprocessing":
+        logger.info("Running jobs with multiprocessing")
+        if not args.workers:
+            logger.debug("(using maximum number of worker threads)")
+        else:
+            logger.debug(f"(using {args.workers} worker threads, if available)")
+        cumval = run_mp.run_dependency_graph(jobs, workers=args.workers)
+        if cumval > 0:
+            logger.error(
+                "At least one blastall comparison failed. Please investigate (exiting)."
+            )
+            raise PyaniException("Multiprocessing run failed in ANIblastall.")
+        logger.info("Multiprocessing run completed without error.")
+    elif args.scheduler.lower() == "sge":
+        logger.info("Running jobs with SGE")
+        logger.debug(f"Setting jobarray group size to {args.sgegroupsize}")
+        logger.debug(f"Joblist contains {len(joblist)}")
+        run_sge.run_dependency_graph(
+            jobs,
+            jgprefix=args.jobprefix,
+            sgegroupsize=args.sgegroupsize,
+            sgeargs=args.sgeargs,
+        )
+    else:
+        logger.error(termcolor(f"Scheduler {args.scheduler} not recognised", "red"))
+        raise SystemError(1)

From d53c4e844fa1444a878132603a51222c628daa91 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:09:40 +0100
Subject: [PATCH 55/75] Add code to update database

---
 .../scripts/subcommands/subcmd_aniblastall.py | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index e312afc0..0b6656c4 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -277,6 +277,16 @@ def subcmd_aniblastall(args: Namespace) -> None:
     run_aniblastall_jobs(joblist, args)
     logger.info("...jobs complete.")
 
+    # Process output and add results to database
+    # This requires us to drop out of threading/multiprocessing: Python's SQLite3
+    # interface doesn't allow sharing connections and cursors
+    logger.info("Adding comparison results to database...")
+    update_comparison_results(joblist, run, session, blastall_version, fraglens, args)
+    update_comparison_matrices(session, run)
+    logger.info("...database updated.")
+
+    # raise NotImplementedError
+
 
 def generate_joblist(
     comparisons: List,
@@ -416,3 +426,55 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
     else:
         logger.error(termcolor(f"Scheduler {args.scheduler} not recognised", "red"))
         raise SystemError(1)
+
+
+def update_comparison_results(
+    joblist: List[ComparisonJob],
+    run,
+    session,
+    blastall_version: str,
+    fraglens: Dict,
+    args: Namespace,
+) -> None:
+    """Update the Comparison table with the completed result set.
+
+    :param joblist:  list of ComparisonJob namedtuples
+    :param run:  Run ORM object for the current ANIblastall run
+    :param session:  active pyanidb session via ORM
+    :param blastall_version:  version of blastall used for the comparison
+    :param fraglens:  dictionary of fragment lengths for each genome
+    :param args:  command-line arguments for this run
+
+    The Comparison table stores individual comparison results, one per row.
+    """
+    logger = logging.getLogger(__name__)
+
+    # Add individual results to Comparison table
+    for job in tqdm(joblist, disable=args.disable_tqdm):
+        logger.debug(f"\t{job.query.description} vs {job.subject.description}")
+        aln_length, sim_errs, ani_pid = aniblastall.parse_blast_tab(
+            job.outfile, fraglens
+        )
+        logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}")
+        logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}")
+        qcov = aln_length / job.query.length
+        scov = aln_length / job.subject.length
+        run.comparisons.append(
+            Comparison(
+                query=job.query,
+                subject=job.subject,
+                aln_length=int(aln_length),
+                sim_errs=int(sim_errs),
+                identity=ani_pid,
+                cov_query=qcov,
+                cov_subject=scov,
+                program="blastall",
+                version=blastall_version,
+                fragsize=job.fragsize,
+                maxmatch=False,
+            )
+        )
+
+    # Populate db
+    logger.debug("Committing results to database")
+    session.commit()

From aba213c07e03d2609b649f59df56ce377ae2e344 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:12:53 +0100
Subject: [PATCH 56/75] Remove or rename things specific to `anib` to
 `aniblastall`

---
 tests/test_aniblastall.py | 67 ++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
index 5c5c6da6..c3645116 100644
--- a/tests/test_aniblastall.py
+++ b/tests/test_aniblastall.py
@@ -50,72 +50,65 @@
 
 from pandas.util.testing import assert_frame_equal
 
-from pyani import anib, pyani_files
+from pyani import anib, pyani_files  # probably don't need anib
 from pyani import aniblastall
 
 
-class ANIbOutput(NamedTuple):
+class ANIblastallOutput(NamedTuple):
 
-    """Convenience struct for ANIb output."""
+    """Convenience struct for ANIblastall output."""
 
     fragfile: Path
-    tabfile: Path
     legacytabfile: Path
 
 
-class ANIbOutputDir(NamedTuple):
+class ANIblastallOutputDir(NamedTuple):
 
-    """Convenience struct for ANIb output."""
+    """Convenience struct for ANIblastall output."""
 
     infiles: List[Path]
     fragfiles: List[Path]
-    blastdir: Path
     legacyblastdir: Path
-    blastresult: pd.DataFrame
     legacyblastresult: pd.DataFrame
 
 
 @pytest.fixture
-def anib_output(dir_anib_in):
-    """Namedtuple of example ANIb output.
+def aniblastall_output(dir_aniblastall_in):
+    """Namedtuple of example ANIblastall output.
 
     fragfile - fragmented FASTA query file
-    tabfile  - BLAST+ tabular output
     legacytabfile - blastall tabular output
     """
-    return ANIbOutput(
-        dir_anib_in / "NC_002696-fragments.fna",
-        dir_anib_in / "NC_002696_vs_NC_011916.blast_tab",
-        dir_anib_in / "NC_002696_vs_NC_010338.blast_tab",
+    return ANIblastallOutput(
+        dir_aniblastall_in / "NC_002696-fragments.fna",
+        dir_aniblastall_in / "NC_002696_vs_NC_010338.blast_tab",
     )
 
 
 @pytest.fixture
-def anib_output_dir(dir_anib_in):
-    """Namedtuple of example ANIb output - full directory.
+def aniblastall_output_dir(dir_aniblastall_in):
+    """Namedtuple of example ANIblastall output - full directory.
 
     infiles - list of FASTA query files
     fragfiles - list of fragmented FASTA query files
-    blastdir - path to BLAST+ output data
     legacyblastdir - path to blastall output data
-    blastresult - pd.DataFrame result for BLAST+
     legacyblastresult - pd.DataFrame result for blastall
     """
-    return ANIbOutputDir(
+    return ANIblastallOutputDir(
         [
             _
-            for _ in (dir_anib_in / "sequences").iterdir()
+            for _ in (dir_aniblastall_in / "sequences").iterdir()
             if _.is_file() and _.suffix == ".fna"
         ],
         [
             _
-            for _ in (dir_anib_in / "fragfiles").iterdir()
+            for _ in (dir_aniblastall_in / "fragfiles").iterdir()
             if _.is_file() and _.suffix == ".fna"
         ],
-        dir_anib_in / "blastn",
-        dir_anib_in / "blastall",
-        pd.read_csv(dir_anib_in / "dataframes" / "blastn_result.csv", index_col=0),
-        pd.read_csv(dir_anib_in / "dataframes" / "blastall_result.csv", index_col=0),
+        dir_aniblastall_in / "blastall",
+        pd.read_csv(
+            dir_aniblastall_in / "dataframes" / "blastall_result.csv", index_col=0
+        ),
     )
 
 
@@ -186,8 +179,8 @@ def test_blastall_graph(path_fna_all, tmp_path, fragment_length):
 
 
 def test_blastall_multiple(path_fna_two, tmp_path):
-    """Generate legacy BLASTN commands."""
-    cmds = aniblastall.generate_blastn_commands(path_fna_two, tmp_path)
+    """Generate legacy BLASTALL commands."""
+    cmds = aniblastall.generate_blastall_commands(path_fna_two, tmp_path)
     expected = [
         (
             "blastall -p blastn -o "
@@ -208,7 +201,7 @@ def test_blastall_multiple(path_fna_two, tmp_path):
 
 
 def test_blastall_single(path_fna_two, tmp_path):
-    """Generate legacy BLASTN command-line."""
+    """Generate legacy BLASTALL command-line."""
     cmd = aniblastall.construct_blastall_cmdline(
         path_fna_two[0], path_fna_two[1], tmp_path
     )
@@ -246,23 +239,23 @@ def test_formatdb_single(path_fna, tmp_path):
 
 
 # Test output file parsing for ANIb methods
-def test_parse_legacy_blastdir(anib_output_dir):
+def test_parse_legacy_blastdir(aniblastall_output_dir):
     """Parses directory of legacy BLAST output."""
-    orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
-    fraglengths = aniblastall.get_fraglength_dict(anib_output_dir.fragfiles)
+    orglengths = pyani_files.get_sequence_lengths(aniblastall_output_dir.infiles)
+    fraglengths = aniblastall.get_fraglength_dict(aniblastall_output_dir.fragfiles)
     result = aniblastall.process_blast(
-        anib_output_dir.legacyblastdir, orglengths, fraglengths
+        aniblastall_output_dir.legacyblastdir, orglengths, fraglengths
     )
     assert_frame_equal(
         result.percentage_identity.sort_index(1).sort_index(),
-        anib_output_dir.legacyblastresult.sort_index(1).sort_index(),
+        aniblastall_output_dir.legacyblastresult.sort_index(1).sort_index(),
     )
 
 
-def test_parse_legacy_blasttab(anib_output):
+def test_parse_legacy_blasttab(aniblastall_output):
     """Parses ANIB legacy .blast_tab output."""
-    fragdata = aniblastall.get_fraglength_dict([anib_output.fragfile])
-    result = aniblastall.parse_blast_tab(anib_output.legacytabfile, fragdata)
+    fragdata = aniblastall.get_fraglength_dict([aniblastall_output.fragfile])
+    result = aniblastall.parse_blast_tab(aniblastall_output.legacytabfile, fragdata)
     assert (
         a == b for a, b in zip(result, [1_966_922, 406_104, 78.578_978_313_253_018])
     )

From 3233fd5c7db5f15389daa162434ac1468f2f4b6d Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:13:15 +0100
Subject: [PATCH 57/75] Skip unnecessary tests

Will probably be removed
---
 tests/test_aniblastall.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
index c3645116..4ae714b0 100644
--- a/tests/test_aniblastall.py
+++ b/tests/test_aniblastall.py
@@ -178,6 +178,7 @@ def test_blastall_graph(path_fna_all, tmp_path, fragment_length):
         assert job.dependencies[0].script.startswith("formatdb")
 
 
+@pytest.mark.skip(reason="unsure this is needed")
 def test_blastall_multiple(path_fna_two, tmp_path):
     """Generate legacy BLASTALL commands."""
     cmds = aniblastall.generate_blastall_commands(path_fna_two, tmp_path)
@@ -215,6 +216,7 @@ def test_blastall_single(path_fna_two, tmp_path):
 
 
 # Test legacy BLAST database formatting (formatdb) command generation
+@pytest.mark.skip(reason="unsure this is needed")
 def test_formatdb_multiple(path_fna_two, tmp_path):
     """Generate legacy BLAST db creation commands."""
     cmds = aniblastall.generate_blastdb_commands(path_fna_two, tmp_path)

From 92932bae1f81f2530b41b6c8d6f6a54209088a61 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 22 Sep 2021 23:14:05 +0100
Subject: [PATCH 58/75] Add `dir_aniblastall_in()` fixture

---
 tests/conftest.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index 61b7c428..48bc4ccc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -117,6 +117,12 @@ def dir_anib_in():
     return FIXTUREPATH / "anib"
 
 
+@pytest.fixture
+def dir_aniblastall_in():
+    """Input files for ANIblastall tests."""
+    return FIXTUREPATH / "anib"
+
+
 @pytest.fixture
 def dir_anim_in():
     """Input files for ANIm tests."""

From 9631001d2f2662a9a1ef85f28ebcf064042a2183 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Thu, 30 Sep 2021 23:39:42 +0100
Subject: [PATCH 59/75] Add lines debugging

May be removed later
---
 pyani/pyani_orm.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pyani/pyani_orm.py b/pyani/pyani_orm.py
index c2378fe4..0a59b0f0 100644
--- a/pyani/pyani_orm.py
+++ b/pyani/pyani_orm.py
@@ -44,6 +44,7 @@
 
 from pathlib import Path
 from typing import Any, Dict, List, NamedTuple, Optional, Tuple
+import logging
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -604,6 +605,8 @@ def update_comparison_matrices(session, run) -> None:
     :param session:       active pyanidb session via ORM
     :param run:           Run ORM object for the current ANIm run
     """
+    logger = logging.getLogger(__name__)
+
     # Create dataframes for storing in the Run table
     # Rows and columns are the (ordered) list of genome IDs
     genome_ids = sorted([_.genome_id for _ in run.genomes.all()])
@@ -623,6 +626,9 @@ def update_comparison_matrices(session, run) -> None:
 
     # Loop over all comparisons for the run and fill in result matrices
     for cmp in run.comparisons.all():
+        logger.debug(f"Comparison: {cmp}")
+        logger.debug(f"Alignment length: {cmp.aln_length}")
+        # logger.debug(f"ANI alignment length: {cmp.ani_alnlen}")
         qid, sid = cmp.query_id, cmp.subject_id
         df_identity.loc[qid, sid] = cmp.identity
         df_identity.loc[sid, qid] = cmp.identity
@@ -635,6 +641,7 @@ def update_comparison_matrices(session, run) -> None:
         df_hadamard.loc[qid, sid] = cmp.identity * cmp.cov_query
         df_hadamard.loc[sid, qid] = cmp.identity * cmp.cov_subject
 
+    logger.debug(f"Alnlength df: {df_alnlength.head()}")
     # Add matrices to the database
     run.df_identity = df_identity.to_json()
     run.df_coverage = df_coverage.to_json()

From c7f60cc7422a81534cf9385e73e42b9310948011 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 5 Oct 2021 15:09:35 +0100
Subject: [PATCH 60/75] Fix naming of elements in `test_subcmd_anib" namespace

---
 tests/test_subcmd_08_anib.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py
index 68664c47..72664741 100644
--- a/tests/test_subcmd_08_anib.py
+++ b/tests/test_subcmd_08_anib.py
@@ -106,13 +106,13 @@ def setUp(self):
                 outdir=self.dirpaths.outdir,
                 dbpath=self.dbpath,
                 force=False,
-                name="test_subcmd anib",
+                name="test_subcmd_anib",
                 classes=self.lblfiles.classes,
                 labels=self.lblfiles.labels,
                 recovery=False,
                 cmdline="ANIb test suite",
                 blastn_exe=self.exes.blastn_exe,
-                filter_exe=self.exes.format_exe,
+                format_exe=self.exes.format_exe,
                 fragsize=1020,
                 scheduler=self.scheduler,
                 workers=None,

From c17413367e27ff718f6a16dc510f8e802957e447 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 5 Oct 2021 15:11:06 +0100
Subject: [PATCH 61/75] Update naming of `TestANIbSubcommand()`

---
 tests/test_subcmd_08_anib.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py
index 72664741..cac3758b 100644
--- a/tests/test_subcmd_08_anib.py
+++ b/tests/test_subcmd_08_anib.py
@@ -77,8 +77,7 @@
 LabelPaths = namedtuple("LabelPaths", "classes labels")
 
 
-@pytest.mark.xfail(reason="ANIb is not currently fully implemented")
-class TestANIbsubcommand(unittest.TestCase):
+class TestANIbSubcommand(unittest.TestCase):
 
     """Class defining tests of the pyani anib subcommand."""
 

From dc89bcd91a6b3c5d4e3130249b26d5b7380c25db Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 5 Oct 2021 15:11:51 +0100
Subject: [PATCH 62/75] Switch to use `NamedTuple`

---
 tests/test_subcmd_08_anib.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py
index cac3758b..17bbb0ed 100644
--- a/tests/test_subcmd_08_anib.py
+++ b/tests/test_subcmd_08_anib.py
@@ -59,7 +59,7 @@
 import unittest
 
 from argparse import Namespace
-from collections import namedtuple
+from typing import NamedTuple
 from pathlib import Path
 
 import pytest
@@ -68,13 +68,21 @@
 
 
 # Convenience struct with paths to third-party executables
-ThirdPartyExes = namedtuple("ThirdPartyExes", "blastn_exe format_exe")
+class ThirdPartyExes(NamedTuple):
+    blastall_exe: Path
+    format_exe: Path
+
 
 # Convenience struct with paths to working directories
-DirPaths = namedtuple("DirPaths", "indir outdir")
+class DirPaths(NamedTuple):
+    indir: Path
+    outdir: Path
+
 
 # Convenience struct for label/class files
-LabelPaths = namedtuple("LabelPaths", "classes labels")
+class LabelPaths(NamedTuple):
+    classes: Path
+    labels: Path
 
 
 class TestANIbSubcommand(unittest.TestCase):

From 13add41314c3c07cc575fd806f6e4193c7c7061a Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 5 Oct 2021 15:13:18 +0100
Subject: [PATCH 63/75] Add `test_subcmd_10_aniblastall.py`

---
 tests/test_subcmd_10_aniblastall.py | 94 +++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 tests/test_subcmd_10_aniblastall.py

diff --git a/tests/test_subcmd_10_aniblastall.py b/tests/test_subcmd_10_aniblastall.py
new file mode 100644
index 00000000..a282880b
--- /dev/null
+++ b/tests/test_subcmd_10_aniblastall.py
@@ -0,0 +1,94 @@
+"""Test aniblastall subcommand for pyani.
+
+The test suite is intended to be run from the repository root using:
+
+pytest -v
+
+Each command CMD available at the command line as pyani <CMD> is
+tested in its own class as a subclass of unittest.TestCase, where
+setUp() defines input/output files, a null logger (which is also
+picked up by nosetests), and a dictionary of command lines, keyed
+by test name, with values representing command-line options.
+
+For each test, command line options are defined in a Namespace and
+passed as the sole argument to the appropriate subcommand.
+"""
+
+import logging
+import os
+import unittest
+
+from argparse import Namespace
+from typing import NamedTuple
+from pathlib import Path
+
+import pytest
+
+from pyani.scripts import subcommands
+
+
+# Convenience struct with paths to third-party executables
+class ThirdPartyExes(NamedTuple):
+    blastall_exe: Path
+    format_exe: Path
+
+
+# Convenience struct with paths to working directories
+class DirPaths(NamedTuple):
+    indir: Path
+    outdir: Path
+
+
+# Convenience struct for label/class files
+class LabelPaths(NamedTuple):
+    classes: Path
+    labels: Path
+
+
+class TestANIblastallSubcommand(unittest.TestCase):
+
+    """Class defining tests of the pyani aniblastall subcommand."""
+
+    def setUp(self):
+        """Configure parameters for tests."""
+        self.dirpaths = DirPaths(
+            Path("tests/test_input/subcmd_anib"),
+            Path("tests/test_output/subcmd_aniblastall"),
+        )
+        os.makedirs(self.dirpaths.outdir, exist_ok=True)
+        self.dbpath = Path("tests/test_output/subcmd_createdb/pyanidb")
+        self.lblfiles = LabelPaths(
+            self.dirpaths.indir / "classes.txt", self.dirpaths.indir / "labels.txt"
+        )
+        self.exes = ThirdPartyExes("blastall", "formatdb")
+        self.scheduler = "multiprocessing"
+
+        # Null logger instance
+        self.logger = logging.getLogger("TestIndexSubcommand logger")
+        self.logger.addHandler(logging.NullHandler())
+
+        # Command line namespaces
+        self.argsdict = {
+            "aniblastall": Namespace(
+                indir=self.dirpaths.indir,
+                outdir=self.dirpaths.outdir,
+                dbpath=self.dbpath,
+                force=False,
+                name="test_subcmd_aniblastall",
+                classes=self.lblfiles.classes,
+                labels=self.lblfiles.labels,
+                recovery=False,
+                cmdline="ANIblastall test suite",
+                blastall_exe=self.exes.blastall_exe,
+                formatdb_exe=self.exes.format_exe,
+                fragsize=1020,
+                scheduler=self.scheduler,
+                workers=None,
+                disable_tqdm=True,
+                jobprefix="ANIblastallTest",
+            )
+        }
+
+    def test_anib(self):
+        """Test aniblastall run."""
+        subcommands.subcmd_aniblastall(self.argsdict["aniblastall"])

From 7566329a30c91b36dbbbfd73b5fee56a7120d7dd Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 5 Oct 2021 15:15:40 +0100
Subject: [PATCH 64/75] Make `test_subcmd_04_anim.py` match other method
 subcommand tests

---
 tests/test_subcmd_04_anim.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/test_subcmd_04_anim.py b/tests/test_subcmd_04_anim.py
index 7387bdae..e514523f 100644
--- a/tests/test_subcmd_04_anim.py
+++ b/tests/test_subcmd_04_anim.py
@@ -48,20 +48,28 @@
 import unittest
 
 from argparse import Namespace
-from collections import namedtuple
+from typing import NamedTuple
 from pathlib import Path
 
 from pyani.scripts import subcommands
 
 
 # Convenience struct with paths to third-party executables
-ThirdPartyExes = namedtuple("ThirdPartyExes", "nucmer_exe filter_exe")
+class ThirdPartyExes(NamedTuple):
+    nucmer_exe: Path
+    format_exe: Path
+
 
 # Convenience struct with paths to working directories
-DirPaths = namedtuple("DirPaths", "indir outdir")
+class DirPaths(NamedTuple):
+    indir: Path
+    outdir: Path
+
 
 # Convenience struct for label/class files
-LabelPaths = namedtuple("LabelPaths", "classes labels")
+class LabelPaths(NamedTuple):
+    classes: Path
+    labels: Path
 
 
 class TestANImSubcommand(unittest.TestCase):
@@ -94,7 +102,7 @@ def setUp(self):
                 outdir=self.dirpaths.outdir,
                 dbpath=self.dbpath,
                 force=False,
-                name="test_anim",
+                name="test_subcmd_anim",
                 classes=self.lblfiles.classes,
                 labels=self.lblfiles.labels,
                 recovery=False,

From 3c688d407e95582dacf220e0a966079ce35339d0 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Tue, 5 Oct 2021 15:33:33 +0100
Subject: [PATCH 65/75] Fix names of executable variables

---
 tests/test_subcmd_04_anim.py | 2 +-
 tests/test_subcmd_08_anib.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_subcmd_04_anim.py b/tests/test_subcmd_04_anim.py
index e514523f..8863d419 100644
--- a/tests/test_subcmd_04_anim.py
+++ b/tests/test_subcmd_04_anim.py
@@ -57,7 +57,7 @@
 # Convenience struct with paths to third-party executables
 class ThirdPartyExes(NamedTuple):
     nucmer_exe: Path
-    format_exe: Path
+    filter_exe: Path
 
 
 # Convenience struct with paths to working directories
diff --git a/tests/test_subcmd_08_anib.py b/tests/test_subcmd_08_anib.py
index 17bbb0ed..5f9dce5e 100644
--- a/tests/test_subcmd_08_anib.py
+++ b/tests/test_subcmd_08_anib.py
@@ -69,7 +69,7 @@
 
 # Convenience struct with paths to third-party executables
 class ThirdPartyExes(NamedTuple):
-    blastall_exe: Path
+    blastn_exe: Path
     format_exe: Path
 
 

From bc25ccc9348c1760f603a42e19950151147f1523 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Thu, 7 Oct 2021 11:59:39 +0100
Subject: [PATCH 66/75] Pass both stdout and stderr to the regex version search

---
 pyani/anib.py        | 2 +-
 pyani/aniblastall.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyani/anib.py b/pyani/anib.py
index 43101ebc..e711cbe5 100644
--- a/pyani/anib.py
+++ b/pyani/anib.py
@@ -143,7 +143,7 @@ def get_version(blast_exe: Path = pyani_config.BLASTN_DEFAULT) -> str:
         check=True,
     )
     version = re.search(  # type: ignore
-        r"(?<=blastn:\s)[0-9\.]*\+", str(result.stdout, "utf-8")
+        r"(?<=blastn:\s)[0-9\.]*\+", str(result.stdout + result.stderr, "utf-8")
     ).group()
 
     if 0 == len(version.strip()):
diff --git a/pyani/aniblastall.py b/pyani/aniblastall.py
index 6e3f4899..c6349a78 100644
--- a/pyani/aniblastall.py
+++ b/pyani/aniblastall.py
@@ -111,7 +111,7 @@ def get_version(blastall_exe: Path = pyani_config.BLASTALL_DEFAULT) -> str:
         return f"blastall exists at {blastall_path} but could not be executed"
 
     version = re.search(  # type: ignore
-        r"(?<=blastall\s)[0-9\.]*", str(result.stderr, "utf-8")
+        r"(?<=blastall\s)[0-9\.]*", str(result.stderr + result.stdout, "utf-8")
     ).group()
 
     if 0 == len(version.strip()):

From a313d2f2bce97df9894761ed8071033260af0bff Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Thu, 7 Oct 2021 12:48:25 +0100
Subject: [PATCH 67/75] Set `indir` and `outdir` to required in
 `anib_parser.py`

---
 pyani/scripts/parsers/anib_parser.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyani/scripts/parsers/anib_parser.py b/pyani/scripts/parsers/anib_parser.py
index b2b99785..61e0b53c 100644
--- a/pyani/scripts/parsers/anib_parser.py
+++ b/pyani/scripts/parsers/anib_parser.py
@@ -68,7 +68,8 @@ def build(
         "--indir",
         action="store",
         dest="indir",
-        default=None,
+        # default=None,
+        required=True,
         type=Path,
         help="input genome directory",
     )
@@ -77,7 +78,8 @@ def build(
         "--outdir",
         action="store",
         dest="outdir",
-        default=None,
+        # default=None,
+        required=True,
         type=Path,
         help="output analysis results directory",
     )

From 3ccd1d9cba39cbd9f02c87ba7e680c944f0039f0 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Thu, 7 Oct 2021 13:16:52 +0100
Subject: [PATCH 68/75] Add `docs/subcmd_anib.rst`

---
 docs/subcmd_anib.rst | 75 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/docs/subcmd_anib.rst b/docs/subcmd_anib.rst
index 25b6a9c2..82e18f77 100644
--- a/docs/subcmd_anib.rst
+++ b/docs/subcmd_anib.rst
@@ -3,3 +3,78 @@
 ==============
 ``pyani anib``
 ==============
+
+The ``anib`` subcommand will carry out ANIb analysis using genome files contained in the ``indir`` directory, writing result files to the ``outdir`` directory, and recording data about each comparison and run in a local `SQLite3`_ database.
+
+.. code-block:: text
+
+    usage: pyani.py anib [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version]
+                     [--citation] [--scheduler {multiprocessing,SGE}]
+                     [--workers WORKERS] [--SGEgroupsize SGEGROUPSIZE]
+                     [--SGEargs SGEARGS] [--jobprefix JOBPREFIX] [--name NAME]
+                     [--classes CLASSES] [--labels LABELS] [--recovery] -i INDIR -o
+                     OUTDIR [--dbpath DBPATH] [--blastn_exe BLASTN_EXE]
+                     [--format_exe FORMAT_EXE] [--fragsize FRAGSIZE]
+
+.. _SQLite3: https://www.sqlite.org/index.html
+
+-----------------
+Flagged arguments
+-----------------
+
+``--blastn_exe BLASTN_EXE``
+    Path to the ``blastn`` executable. Default: ``blastn``
+
+``--classes CLASSFNAME``
+    Use the set of classes (one per genome sequence file) found in the file ``CLASSFNAME`` in ``INDIR``. Default: ``classes.txt``
+
+``--dbpath DBPATH``
+    Path to the location of the local ``pyani`` database to be used. Default: ``.pyani/pyanidb``
+
+``--disable_tqdm``
+    Disable the ``tqdm`` progress bar while the ANIb process runs. This is useful when testing to avoid aesthetic problems with test output.
+
+``--format_exe FORMAT_EXE``
+    Path to the ``blastn`` executable. Default: ``makeblastdb``
+
+``--fragsize FRAGSIZE``
+    Fragment size to use in analysis. (default: 1020)
+
+``-h, --help``
+    Display usage information for ``pyani anib``.
+
+``-i INDIR, --input INDIR``
+    Path to the directory containing indexed genome files to be used for the analysis.
+
+``--jobprefix JOBPREFIX``
+    Use the string ``JOBPREFIX`` as a prefix for SGE job submission names. Default: ``PYANI``
+
+``--labels LABELFNAME``
+    Use the set of labels (one per genome sequence file) found in the file ``LABELFNAME`` in ``INDIR``. Default: ``labels.txt``
+
+``-l LOGFILE, --logfile LOGFILE``
+    Provide the location ``LOGFILE`` to which a logfile of the ANIb process will be written.
+
+``--name NAME``
+    Use the string ``NAME`` to identify this ``ANIb`` run in the ``pyani`` database.
+
+``-o OUTDIR, --outdir OUTDIR``
+    Path to a directory where comparison output files will be written.
+
+``--recovery``
+    Use existing ``ANIb`` comparison output if available, e.g. if recovering from a failed job submission. Using this option will not generate a new comparison if the old output files exist.
+
+``--scheduler {multiprocessing, SGE}``
+    Specify the job scheduler to be used when parallelising genome comparisons: one of ``multiprocessing`` (use many cores on the current machine)  or ``SGE`` (use an SGE or OGE job scheduler). Default: ``multiprocessing``.
+
+``--SGEargs SGEARGS``
+    Pass additional arguments ``SGEARGS`` to ``qsub`` when running the SGE-distributed jobs.
+
+``--SGEgroupsize SGEGROUPSIZE``
+    Create SGE arrays containing SGEGROUPSIZE comparison jobs. Default: 10000
+
+``-v, --verbose``
+    Provide verbose output to ``STDOUT``.
+
+``--workers WORKERS``
+    Spawn WORKERS worker processes with the ``--scheduler multiprocessing`` option. Default: 0 (use all cores)

From f423dbe3ce48c8cbc1e26ebe0a097980be8735f8 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Thu, 7 Oct 2021 13:17:39 +0100
Subject: [PATCH 69/75] Add documentation for `aniblastall`

---
 docs/run_aniblastall.rst    | 121 ++++++++++++++++++++++++++++++++++++
 docs/subcmd_aniblastall.rst |  80 ++++++++++++++++++++++++
 docs/subcommands.rst        |   1 +
 3 files changed, 202 insertions(+)
 create mode 100644 docs/run_aniblastall.rst
 create mode 100644 docs/subcmd_aniblastall.rst

diff --git a/docs/run_aniblastall.rst b/docs/run_aniblastall.rst
new file mode 100644
index 00000000..cb5884d7
--- /dev/null
+++ b/docs/run_aniblastall.rst
@@ -0,0 +1,121 @@
+.. _pyani-run_aniblastall:
+
+=====================
+Running ANIblastall analysis
+=====================
+
+``pyani`` implements average nucleotide identity analysis using `NCBI-BLAST`_ (*ANIblastall*) as defined in Goris `et al.` (2007) (`doi:10.1099/ijs.0.64483-0`_). To run ANIblastall on a set of input genomes, use the ``pyani aniblastall`` subcommand.
+
+In brief, the analysis proceeds as follows for a set of input prokaryotic genomes:
+
+1. Each input genome is fragmented into consecutive sequences of a given size (default: 1020bp)
+2. A new ``BLAST`` database is built from each input genome sequence
+3. `NCBI-BLAST`_ is used to perform pairwise comparisons of each input genome fragment set against the databases for each other input genome, to identify homologous (alignable) regions.
+4. For each comparison, the alignment output is parsed, and the following values are calculated:
+
+    - total number of aligned bases on each genome
+    - fraction of each genome that is aligned (the *coverage*)
+    - the proportion of all aligned regions that is identical in each genome (the *ANI*)
+    - the number of unaligned or non-identical bases (the *similarity errors*)
+    - the product of *coverage* and *ANI*
+
+The output values are recorded in the ``pyani`` database.
+
+.. NOTE::
+    The `NCBI-BLAST`_ comparisons are asymmetric, and performed in both directions for a pair of genomes (i.e. "fragmented A vs complete B" and "fragmented B vs complete A").
+
+.. TIP::
+    The `NCBI-BLAST`_ comparisons are embarrasingly parallel, and can be distributed across cores on an `Open Grid Scheduler`_-compatible cluster, using the ``--scheduler SGE`` option.
+
+.. ATTENTION::
+    ``pyani aniblastall`` requires that a working copy of `NCBI-BLAST`_ is available. Please see :ref:`pyani-installation` for information about installing this package.
+
+For more information about the ``pyani aniblastall`` subcommand, please see the :ref:`pyani-subcmd-aniblastall` page, or issue the command ``pyani aniblastall -h`` to see the inline help.
+
+---------------------
+Perform ANIblastall analysis
+---------------------
+
+The basic form of the command is:
+
+.. code-block:: bash
+
+    pyani aniblastall -i <INPUT_DIRECTORY> -o <OUTPUT_DIRECTORY>
+
+This instructs ``pyani`` to perform ANIblastall on the genome FASTA files in ``<INPUT_DIRECTORY>``, and write any output files to ``<OUTPUT_DIRECTORY>``. For example, the following command performs ANIblastall on genomes in the directory ``genomes`` and writes output to a new directory ``genomes_ANIblastall``:
+
+.. code-block:: bash
+
+    pyani aniblastall -i genomes -o genomes_ANIblastall
+
+.. NOTE::
+    While running, ``pyani aniblastall`` will show progress bars unless these are disabled with the option ``--disable_tqdm``
+
+This command will write the intermediate `NCBI-BLAST`_ output to the directory ``genomes_ANIblastall``, where the results can be inspected if required.
+
+..
+  I am unsure if this is relevant for aniblastall
+  .. code-block:: bash
+
+    $ ls genomes_ANIblastall/
+    nucmer_output
+
+.. ATTENTION::
+    To view the output ANIblastall results, you will need to use the ``pyani report`` or ``pyani plot`` subcommands. Please see :ref:`pyani-subcmd-report` and :ref:`pyani-subcmd-plot` for more details.
+
+----------------------------------------------
+Perform ANIblastall analysis with Open Grid Scheduler
+----------------------------------------------
+
+The `NCBI-BLAST`_ comparisons are embarrasingly parallel, and these jobs can be distributed across cores in a cluster using the `Open Grid Scheduler`_. To enable this during the analysis, use the ``--scheduler SGE`` option:
+
+.. code-block:: bash
+
+    pyani aniblastall --scheduler SGE -i genomes -o genomes_ANIblastall
+
+.. NOTE::
+    Jobs are submitted as *array jobs* to keep the scheduler queue short.
+
+.. NOTE::
+    If ``--scheduler SGE`` is not specified, all `NCBI-BLAST`_ jobs are run locally with ``Python``'s ``multiprocessing`` module.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Controlling parameters of Open Grid Scheduler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It is possible to control the following features of `Open Grid Scheduler`_ `via` the ``pyani aniblastall`` subcommand:
+
+- The array job size (by default, comparison jobs are batched in arrays of 10,000)
+- The prefix string for the job, as reported in the scheduler queue
+- Arguments to the ``qsub`` job submission command
+
+These allow for useful control of job execution. For example, the command:
+
+.. code-block:: bash
+
+    pyani aniblastall --scheduler SGE --SGEgroupsize 5000 -i genomes -o genomes_ANIblastall
+
+will batch ``ANIblastall`` jobs in groups of 500 for the scheduler. The command:
+
+.. code-block:: bash
+
+    pyani aniblastall --scheduler SGE --jobprefix My_Ace_Job -i genomes -o genomes_ANIblastall
+
+will prepend the string ``My_Ace_Job`` to your job in the scheduler queue. And the command:
+
+.. code-block:: bash
+
+    pyani aniblastall --scheduler SGE --SGEargs "-m e -M my.name@my.domain" --SGEgroupsize 5000 -i genomes -o genomes_ANIblastall
+
+will email ``my.name@my.domain`` when the jobs finish.
+
+
+----------
+References
+----------
+
+- Goris`et al.` (2007) `Int J Syst Evol Micr` _57_: 81-91. `doi:10.1099/ijs.0.64483-0`.
+
+.. _doi:10.1099/ijs.0.64483-0: https://dx.doi.org/10.1099/ijs.0.64483-0
+.. _NCBI-BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download
+.. _Open Grid Scheduler: http://gridscheduler.sourceforge.net/
diff --git a/docs/subcmd_aniblastall.rst b/docs/subcmd_aniblastall.rst
new file mode 100644
index 00000000..c4f472c7
--- /dev/null
+++ b/docs/subcmd_aniblastall.rst
@@ -0,0 +1,80 @@
+.. _pyani-subcmd-aniblastall:
+
+==============
+``pyani aniblastall``
+==============
+
+The ``aniblastall`` subcommand will carry out ANIb analysis using genome files contained in the ``indir`` directory, writing result files to the ``outdir`` directory, and recording data about each comparison and run in a local `SQLite3`_ database.
+
+.. code-block:: text
+
+    usage: pyani.py aniblastall [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version]
+                     [--citation] [--scheduler {multiprocessing,SGE}]
+                     [--workers WORKERS] [--SGEgroupsize SGEGROUPSIZE]
+                     [--SGEargs SGEARGS] [--jobprefix JOBPREFIX] [--name NAME]
+                     [--classes CLASSES] [--labels LABELS] [--recovery] -i INDIR -o
+                     OUTDIR [--dbpath DBPATH] [--blastall_exe BLASTALL_EXE]
+                     [--format_exe FORMAT_EXE] [--fragsize FRAGSIZE]
+
+.. _SQLite3: https://www.sqlite.org/index.html
+
+-----------------
+Flagged arguments
+-----------------
+
+``--blastall_exe BLASTALL_EXE``
+    Path to the ``blastall`` executable. Default: ``blastall``
+
+``--classes CLASSFNAME``
+    Use the set of classes (one per genome sequence file) found in the file ``CLASSFNAME`` in ``INDIR``. Default: ``classes.txt``
+
+``--dbpath DBPATH``
+    Path to the location of the local ``pyani`` database to be used. Default: ``.pyani/pyanidb``
+
+``--disable_tqdm``
+    Disable the ``tqdm`` progress bar while the ANIblastall process runs. This is useful when testing to avoid aesthetic problems with test output.
+
+``--format_exe FORMAT_EXE``
+    Path to the ``blastall`` executable. Default: ``formatdb``
+
+``--fragsize FRAGSIZE``
+    Fragment size to use in analysis. (default: 1020)
+
+``-h, --help``
+    Display usage information for ``pyani aniblastall``.
+
+``-i INDIR, --input INDIR``
+    Path to the directory containing indexed genome files to be used for the analysis.
+
+``--jobprefix JOBPREFIX``
+    Use the string ``JOBPREFIX`` as a prefix for SGE job submission names. Default: ``PYANI``
+
+``--labels LABELFNAME``
+    Use the set of labels (one per genome sequence file) found in the file ``LABELFNAME`` in ``INDIR``. Default: ``labels.txt``
+
+``-l LOGFILE, --logfile LOGFILE``
+    Provide the location ``LOGFILE`` to which a logfile of the ANIb process will be written.
+
+``--name NAME``
+    Use the string ``NAME`` to identify this ``ANIblastall`` run in the ``pyani`` database.
+
+``-o OUTDIR, --outdir OUTDIR``
+    Path to a directory where comparison output files will be written.
+
+``--recovery``
+    Use existing ``ANIblastall`` comparison output if available, e.g. if recovering from a failed job submission. Using this option will not generate a new comparison if the old output files exist.
+
+``--scheduler {multiprocessing, SGE}``
+    Specify the job scheduler to be used when parallelising genome comparisons: one of ``multiprocessing`` (use many cores on the current machine)  or ``SGE`` (use an SGE or OGE job scheduler). Default: ``multiprocessing``.
+
+``--SGEargs SGEARGS``
+    Pass additional arguments ``SGEARGS`` to ``qsub`` when running the SGE-distributed jobs.
+
+``--SGEgroupsize SGEGROUPSIZE``
+    Create SGE arrays containing SGEGROUPSIZE comparison jobs. Default: 10000
+
+``-v, --verbose``
+    Provide verbose output to ``STDOUT``.
+
+``--workers WORKERS``
+    Spawn WORKERS worker processes with the ``--scheduler multiprocessing`` option. Default: 0 (use all cores)
diff --git a/docs/subcommands.rst b/docs/subcommands.rst
index fa29f872..e169383a 100644
--- a/docs/subcommands.rst
+++ b/docs/subcommands.rst
@@ -22,6 +22,7 @@ This document links out to detailed instructions for each of the ``pyani`` subco
     subcmd_createdb
     subcmd_anim
     subcmd_anib
+    subcmd_aniblastall
     subcmd_report
     subcmd_plot
     subcmd_classify

From 8c40113634b509c4697ccbc615b180dc45b11b87 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Sun, 12 Dec 2021 21:06:44 +0000
Subject: [PATCH 70/75] Replace f-strings in logging statements

---
 pyani/scripts/subcommands/subcmd_anib.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_anib.py b/pyani/scripts/subcommands/subcmd_anib.py
index 991aa765..2d8f3fd4 100644
--- a/pyani/scripts/subcommands/subcmd_anib.py
+++ b/pyani/scripts/subcommands/subcmd_anib.py
@@ -179,7 +179,7 @@ def subcmd_anib(args: Namespace) -> None:
         os.makedirs(args.outdir, exist_ok=True)
     except IOError:
         logger.error(
-            f"Could not create output directory {args.outdir} (exiting)", exc_info=True
+            "Could not create output directory %s (exiting)", args.outdir, exc_info=True
         )
         raise SystemError(1)
     fragdir = Path(str(args.outdir)) / "fragments"
@@ -231,7 +231,7 @@ def subcmd_anib(args: Namespace) -> None:
         session, run, comparisons, "blastn", blastn_version, args.fragsize, False
     )
     logger.info(
-        f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"
+        "\t...after check, still need to run %s comparisons", len(comparisons_to_run)
     )
 
     # If there are no comparisons to run, update the Run matrices and exit
@@ -291,7 +291,7 @@ def subcmd_anib(args: Namespace) -> None:
         comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args
     )
     logger.debug("...created %s blastn jobs", len(joblist))
-    # print(f"Joblist: {joblist}")
+
     # Pass jobs to appropriate scheduler
     logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler)
     run_anib_jobs(joblist, args)
@@ -305,8 +305,6 @@ def subcmd_anib(args: Namespace) -> None:
     update_comparison_matrices(session, run)
     logger.info("...database updated.")
 
-    # raise NotImplementedError
-
 
 def generate_joblist(
     comparisons: List,
@@ -472,8 +470,7 @@ def update_comparison_results(
     for job in tqdm(joblist, disable=args.disable_tqdm):
         logger.debug("\t%s vs %s", job.query.description, job.subject.description)
         aln_length, sim_errs, ani_pid = anib.parse_blast_tab(job.outfile, fraglens)
-        logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}")
-        logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}")
+
         qcov = aln_length / job.query.length
         scov = aln_length / job.subject.length
         run.comparisons.append(

From 0596e7776995a8c3be9a3033811f8d2399211067 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 13 Apr 2022 16:28:08 +0100
Subject: [PATCH 71/75] Update .gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 1bb86dd5..4c2608e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Scratch directory for local testing
 scratch/
+makefile_template
 
 # Mac-related dreck
 .DS_Store
@@ -72,4 +73,4 @@ venv-*
 
 # Extra documentation output
 classes_pyani.pdf
-packages_pyani.pdf
\ No newline at end of file
+packages_pyani.pdf

From 6b7b52f38d92eb0bac4258d8ffbf550a62340359 Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 13 Apr 2022 17:35:13 +0100
Subject: [PATCH 72/75] Update call to `add_run()` to fit the function's new
 return value

---
 pyani/scripts/subcommands/subcmd_aniblastall.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index 0b6656c4..c3e4e620 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -132,7 +132,7 @@ def subcmd_aniblastall(args: Namespace) -> None:
     # Add information about this run to the database
     logger.debug(f"Adding run info to database {args.dbpath}...")
     try:
-        run = add_run(
+        run, run_id = add_run(
             session,
             method="ANIblastall",
             cmdline=args.cmdline,
@@ -143,20 +143,20 @@ def subcmd_aniblastall(args: Namespace) -> None:
     except PyaniORMException:
         logger.error("Could not add run to the database (exiting)", exc_info=True)
         raise SystemExit(1)
-    logger.debug(f"\t...added run ID: {run} to the database")
+    logger.debug(f"\t...added run ID: {run_id} to the database")
 
     # Identify input files for comparison, and populate the database
-    logger.debug(f"Adding files for {run} to database...")
+    logger.debug(f"Adding files for run {run_id} to database...")
     try:
         genome_ids = add_run_genomes(
             session, run, args.indir, args.classes, args.labels
         )
     except PyaniORMException:
         logger.error(
-            f"Could not add genomes to database for run {run} (exiting)", exc_info=True
+            "Could not add genomes to database for run (exiting)", exc_info=True
         )
         raise SystemExit(1)
-    logger.debug(f"\t...added gnome IDs: {genome_ids}")
+    logger.debug(f"\t...added genome IDs for run {run_id}: {genome_ids}")
 
     # Get list of genomes for this anlaysis from the database
     logger.info("Compiling genomes for comparison")

From 23d951fafd839448494413b20c3b1a7492d5eacb Mon Sep 17 00:00:00 2001
From: baileythegreen <baileythegreen@gmail.com>
Date: Wed, 13 Apr 2022 17:36:04 +0100
Subject: [PATCH 73/75] Remove f-strings from logging calls

---
 .../scripts/subcommands/subcmd_aniblastall.py | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/pyani/scripts/subcommands/subcmd_aniblastall.py b/pyani/scripts/subcommands/subcmd_aniblastall.py
index c3e4e620..68d2a0d2 100644
--- a/pyani/scripts/subcommands/subcmd_aniblastall.py
+++ b/pyani/scripts/subcommands/subcmd_aniblastall.py
@@ -112,25 +112,25 @@ def subcmd_aniblastall(args: Namespace) -> None:
 
     # Get BLASTALL version - this will be used in the database entreis
     blastall_version = aniblastall.get_version(args.blastall_exe)
-    logger.info(termcolor(f"BLAST+ blastall version: {blastall_version}", "cyan"))
+    logger.info(termcolor("BLAST+ blastall version: %s", blastall_version, "cyan"))
 
     # Use provided name, or make new one for this analysis
     start_time = datetime.datetime.now()
     name = args.name or "_".join(["ANIblastall", start_time.isoformat()])
-    logger.info(termcolor(f"Analysis name: {name}", "cyan"))
+    logger.info(termcolor("Analysis name: %s", name, "cyan"))
 
     # Connect to existing database (which may be "clean" or have old analyses)
-    logger.debug(f"Connecting to database {args.dbpath}")
+    logger.debug("Connecting to database %s", args.dbpath)
     try:
         session = get_session(args.dbpath)
     except Exception:
         logger.error(
-            f"Could not connect to database {args.dbpath} (exiting)", exc_info=True
+            "Could not connect to database %s (exiting)", args.dbpath, exc_info=True
         )
         raise SystemExit(1)
 
     # Add information about this run to the database
-    logger.debug(f"Adding run info to database {args.dbpath}...")
+    logger.debug("Adding run info to database %s...", args.dbpath)
     try:
         run, run_id = add_run(
             session,
@@ -143,10 +143,10 @@ def subcmd_aniblastall(args: Namespace) -> None:
     except PyaniORMException:
         logger.error("Could not add run to the database (exiting)", exc_info=True)
         raise SystemExit(1)
-    logger.debug(f"\t...added run ID: {run_id} to the database")
+    logger.debug("\t...added run ID: %s to the database", run_id)
 
     # Identify input files for comparison, and populate the database
-    logger.debug(f"Adding files for run {run_id} to database...")
+    logger.debug("Adding files for run %s to database...", run_id)
     try:
         genome_ids = add_run_genomes(
             session, run, args.indir, args.classes, args.labels
@@ -156,21 +156,21 @@ def subcmd_aniblastall(args: Namespace) -> None:
             "Could not add genomes to database for run (exiting)", exc_info=True
         )
         raise SystemExit(1)
-    logger.debug(f"\t...added genome IDs for run {run_id}: {genome_ids}")
+    logger.debug("\t...added genome IDs for run %s: %s", run_id, genome_ids)
 
     # Get list of genomes for this anlaysis from the database
     logger.info("Compiling genomes for comparison")
     genomes = run.genomes.all()
-    logger.debug(f"\tCollected {len(genomes)} genomes for this run")
+    logger.debug("\tCollected %s genomes for this run", len(genomes))
 
     # Create output directories. We create the amin parent directory (args.outdir), but
     # also subdirectories for the BLAST databases.
-    logger.debug(f"Creating output directory {args.outdir}")
+    logger.debug("Creating output directory %s", args.outdir)
     try:
         os.makedirs(args.outdir, exist_ok=True)
     except IOError:
         logger.error(
-            f"Could not create output directory {args.outdir} (exiting)", exc_info=True
+            "Could not create output directory %s (exiting)", args.outdir, exc_info=True
         )
         raise SystemError(1)
     fragdir = Path(str(args.outdir)) / "fragments"
@@ -188,7 +188,6 @@ def subcmd_aniblastall(args: Namespace) -> None:
         fragpath, fragsizes = fragment_fasta_file(
             Path(str(genome.path)), Path(str(fragdir)), args.fragsize
         )
-        logger.info(f"fragsizes: {type(fragsizes)}")
         fragfiles.update({Path(genome.path).stem: fragpath})
         fraglens.update({Path(genome.path).stem: fragsizes})
 
@@ -216,7 +215,7 @@ def subcmd_aniblastall(args: Namespace) -> None:
         "Compiling pairwise comparisons (this can take time for large datasets)..."
     )
     comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
-    logger.info(f"\t...total pairwise comparisons to be performed: {len(comparisons)}")
+    logger.info("\t...total pairwise comparisons to be performed: %s", len(comparisons))
 
     # Check for existing comparisons; if one has already been done (for the same
     # software package, version, and setting) we add the comparison to this run,
@@ -226,7 +225,7 @@ def subcmd_aniblastall(args: Namespace) -> None:
         session, run, comparisons, "blastall", blastall_version, args.fragsize, False
     )
     logger.info(
-        f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"
+        "\t...after check, still need to run %s comparisons", len(comparisons_to_run)
     )
 
     # If there are no comparisons to run, update the Run matrices and exit
@@ -250,12 +249,13 @@ def subcmd_aniblastall(args: Namespace) -> None:
     if args.recovery:
         logger.warning("Entering recovery mode...")
         logger.debug(
-            f"\tIn this mode, existing comparison output from {args.outdir} is reused"
+            "\tIn this mode, existing comparison output from %s is reused", args.outdir
         )
         existing_files = collect_existing_output(args.outdir, "blastall", args)
         if existing_files:
             logger.debug(
-                f"\tIdentified {len(existing_files)} existing output files for reuse, existing_files[0] (et cetera)"
+                "\tIdentified %s existing output files for reuse, existing_files[0] (et cetera)",
+                len(existing_files),
             )
         else:
             logger.debug("\tIdentified no existing output files")
@@ -270,10 +270,10 @@ def subcmd_aniblastall(args: Namespace) -> None:
     joblist = generate_joblist(
         comparisons_to_run, existing_files, fragfiles.values(), fraglens.values(), args
     )
-    logger.debug(f"...created {len(joblist)} blastall jobs")
+    logger.debug("...created %s blastall jobs", len(joblist))
 
     # Pass jobs to appropriate scheduler
-    logger.debug(f"Passing {len(joblist)} jobs to {args.scheduler}")
+    logger.debug("Passing %s jobs to %s", len(joblist), args.scheduler)
     run_aniblastall_jobs(joblist, args)
     logger.info("...jobs complete.")
 
@@ -320,10 +320,10 @@ def generate_joblist(
         blastallcmd = aniblastall.generate_blastall_commands(
             qfrags, Path(subject.path), args.outdir, args.blastall_exe
         )
-        logger.debug(f"Commands to run:\n\t{blastallcmd}\n")
+        logger.debug("Commands to run:\n\t%s\n", blastallcmd)
         outprefix = blastallcmd.split()[4][:-10]  # prefix for blastall output
         outfname = Path(outprefix + ".blast_tab")
-        logger.debug(f"Expected output file for db: {outfname}")
+        logger.debug("Expected output file for db: %s", outfname)
 
         # If we are in recovery mode, we are salvaging output from a previous
         # run, and do not necessarily need to rerun all the jobs. In this case,
@@ -334,7 +334,7 @@ def generate_joblist(
         # added to the database whether they come from recovery mode or are run
         # in this call of the script.
         if args.recovery and outfname in existing_files:
-            logger.debug(f"Recovering output from {outfname}, not submitting job")
+            logger.debug("Recovering output from %s, not submitting job", outfname)
             # Need to track the expected output, but set the job itself to None.
             joblist.append(
                 ComparisonJob(
@@ -395,7 +395,7 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
     :param args:     command-line arguments for the run
     """
     logger = logging.getLogger(__name__)
-    logger.debug(f"Scheduler: {args.scheduler}")
+    logger.debug("Scheduler: %s", args.scheduler)
 
     # Entries with None seen in recovery mode:
     jobs = [_.job for _ in joblist if _.job]
@@ -405,7 +405,7 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
         if not args.workers:
             logger.debug("(using maximum number of worker threads)")
         else:
-            logger.debug(f"(using {args.workers} worker threads, if available)")
+            logger.debug("(using %s worker threads, if available)", args.workers)
         cumval = run_mp.run_dependency_graph(jobs, workers=args.workers)
         if cumval > 0:
             logger.error(
@@ -415,8 +415,8 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
         logger.info("Multiprocessing run completed without error.")
     elif args.scheduler.lower() == "sge":
         logger.info("Running jobs with SGE")
-        logger.debug(f"Setting jobarray group size to {args.sgegroupsize}")
-        logger.debug(f"Joblist contains {len(joblist)}")
+        logger.debug("Setting jobarray group size to %s", args.sgegroupsize)
+        logger.debug("Joblist contains %s", len(joblist))
         run_sge.run_dependency_graph(
             jobs,
             jgprefix=args.jobprefix,
@@ -424,7 +424,7 @@ def run_aniblastall_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
             sgeargs=args.sgeargs,
         )
     else:
-        logger.error(termcolor(f"Scheduler {args.scheduler} not recognised", "red"))
+        logger.error(termcolor("Scheduler %s not recognised", args.scheduler, "red"))
         raise SystemError(1)
 
 
@@ -451,12 +451,10 @@ def update_comparison_results(
 
     # Add individual results to Comparison table
     for job in tqdm(joblist, disable=args.disable_tqdm):
-        logger.debug(f"\t{job.query.description} vs {job.subject.description}")
+        logger.debug("\t%s vs %s", job.query.description, job.subject.description)
         aln_length, sim_errs, ani_pid = aniblastall.parse_blast_tab(
             job.outfile, fraglens
         )
-        logger.debug(f"Results: {aln_length}, {sim_errs}, {ani_pid}")
-        logger.debug(f"Results: {type(aln_length)}, {type(sim_errs)}, {type(ani_pid)}")
         qcov = aln_length / job.query.length
         scov = aln_length / job.subject.length
         run.comparisons.append(

From ffba9022163894c9120b4b15919e42c802c1e332 Mon Sep 17 00:00:00 2001
From: Leighton Pritchard <leightonpritchard@mac.com>
Date: Tue, 7 Jun 2022 18:14:56 +0100
Subject: [PATCH 74/75] update deprecated behaviours in pandas

- the sort_index() call now needs axis as an explicit keyword
- pandas.utils.testing is now pandas.testing
---
 tests/test_aniblastall.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
index 5a8a0794..b04beb71 100644
--- a/tests/test_aniblastall.py
+++ b/tests/test_aniblastall.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# (c) University of Strathclyde 2019-2021
+# (c) University of Strathclyde 2019-2022
 # Author: Bailey Harrington
 #
 # Contact:
@@ -16,7 +16,7 @@
 #
 # The MIT License
 #
-# Copyright (c) 2019-2021 University of Strathclyde
+# Copyright (c) 2019-2022 University of Strathclyde
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -49,7 +49,7 @@
 import pytest  # noqa: F401  # pylint: disable=unused-import
 import unittest
 
-from pandas.util.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal
 
 from pyani import anib, pyani_files  # probably don't need anib
 from pyani import aniblastall
@@ -263,8 +263,8 @@ def test_parse_legacy_blastdir(aniblastall_output_dir):
         aniblastall_output_dir.legacyblastdir, orglengths, fraglengths
     )
     assert_frame_equal(
-        result.percentage_identity.sort_index(1).sort_index(),
-        aniblastall_output_dir.legacyblastresult.sort_index(1).sort_index(),
+        result.percentage_identity.sort_index(axis=1).sort_index(),
+        aniblastall_output_dir.legacyblastresult.sort_index(axis=1).sort_index(),
     )
 
 

From bdefef028a338689d9c9595307ec622e0498bdbe Mon Sep 17 00:00:00 2001
From: Leighton Pritchard <leightonpritchard@mac.com>
Date: Tue, 7 Jun 2022 18:31:08 +0100
Subject: [PATCH 75/75] remove unused import

If an import is no longer required, it should be removed, rather
than commented as "probably not required" or similar.
---
 tests/test_aniblastall.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_aniblastall.py b/tests/test_aniblastall.py
index b04beb71..e534c9c4 100644
--- a/tests/test_aniblastall.py
+++ b/tests/test_aniblastall.py
@@ -51,7 +51,7 @@
 
 from pandas.testing import assert_frame_equal
 
-from pyani import anib, pyani_files  # probably don't need anib
+from pyani import pyani_files
 from pyani import aniblastall
 
 # Create object for accessing unittest assertions