Skip to content

Commit

Permalink
add command to suggest specificity thresholds
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Aug 23, 2024
1 parent aa11e97 commit 14090cb
Show file tree
Hide file tree
Showing 8 changed files with 556 additions and 3 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@

# Changelog

## v1.0.1 rpetit3/camlhmp "" 2024/08/??

### `Added`

- `camlhmp-blast-thresholds` command to suggest specificity thresholds for BLAST results
- `print_versions` to print versions from multiple schemas
- `print_camlhmp_version` to print the camlhmp version

## v1.0.0 rpetit3/camlhmp "Dromedary" 2024/08/15

### `Added`
Expand Down
2 changes: 1 addition & 1 deletion camlhmp/cli/blast/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def camlhmp_blast_regions(
# Describe the command line arguments
console = rich.console.Console(stderr=True)
print(
"[italic]Running [deep_sky_blue1]camlhmp[/deep_sky_blue1] with following parameters:[/italic]",
"[italic]Running [deep_sky_blue1]camlhmp-blast-regions[/deep_sky_blue1] with following parameters:[/italic]",
file=sys.stderr,
)
print(f"[italic] --input {input}[/italic]", file=sys.stderr)
Expand Down
2 changes: 1 addition & 1 deletion camlhmp/cli/blast/targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def camlhmp_blast_targets(
# Describe the command line arguments
console = rich.console.Console(stderr=True)
print(
"[italic]Running [deep_sky_blue1]camlhmp[/deep_sky_blue1] with following parameters:[/italic]",
"[italic]Running [deep_sky_blue1]camlhmp-blast-targets[/deep_sky_blue1] with following parameters:[/italic]",
file=sys.stderr,
)
print(f"[italic] --input {input}[/italic]", file=sys.stderr)
Expand Down
299 changes: 299 additions & 0 deletions camlhmp/cli/blast/thresholds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
import logging
import os
import sys
from pathlib import Path

import rich
import rich.console
import rich.traceback
import rich_click as click
from rich import print
from rich.logging import RichHandler
from rich.table import Table

import camlhmp
from camlhmp.engines.blast import run_blast
from camlhmp.framework import check_types, get_types, print_camlhmp_version
from camlhmp.parsers.blast import get_blast_target_hits
from camlhmp.utils import file_exists_error, validate_file, parse_seqs, write_tsv

# Set up Rich
stderr = rich.console.Console(stderr=True)
rich.traceback.install(console=stderr, width=200, word_wrap=True, extra_lines=1)
click.rich_click.USE_RICH_MARKUP = True
click.rich_click.OPTION_GROUPS = {
"camlhmp": [
{
"name": "Required Options",
"options": [
"--input",
"--blast",
],
},
{
"name": "Filtering Options",
"options": [
"--min-pident",
"--min-coverage",
"--increment",
],
},
{
"name": "Additional Options",
"options": [
"--prefix",
"--outdir",
"--force",
"--verbose",
"--silent",
"--version",
"--help",
],
},
]
}


@click.command()
@click.option(
"--input",
"-i",
required=False if "--version" in sys.argv else True,
help="Input file in FASTA format of reference sequences",
)
@click.option(
"--blast",
"-b",
required=False if "--version" in sys.argv else True,
type=click.Choice(["blastn", "blastp", "blastx", "tblastn", "tblastx"], case_sensitive=True),
help="The blast algorithm to use",
)
@click.option(
"--outdir",
"-o",
type=click.Path(exists=False),
default="./camlhmp-blast-thresholds",
show_default=True,
help="Directory to write output",
)
@click.option(
"--prefix",
"-p",
type=str,
default="camlhmp",
show_default=True,
help="Prefix to use for output files",
)
@click.option(
"--min-pident",
default=70,
show_default=True,
help="Minimum percent identity to test",
)
@click.option(
"--min-coverage",
default=70,
show_default=True,
help="Minimum percent coverage to test",
)
@click.option(
"--increment",
default=1,
show_default=True,
help="The value to increment the thresholds by",
)
@click.option("--force", is_flag=True, help="Overwrite existing reports")
@click.option("--verbose", is_flag=True, help="Increase the verbosity of output")
@click.option("--silent", is_flag=True, help="Only critical errors will be printed")
@click.option("--version", is_flag=True, help="Print schema and camlhmp version")
def camlhmp_blast_thresholds(
input,
blast,
prefix,
min_pident,
min_coverage,
increment,
outdir,
force,
verbose,
silent,
version,
):
"""🐪 camlhmp-blast-thresholds 🐪 - Determine the specificity thresholds for a set of reference sequences"""
# Setup logs
logging.basicConfig(
format="%(asctime)s:%(name)s:%(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[
RichHandler(rich_tracebacks=True, console=rich.console.Console(stderr=True))
],
)
logging.getLogger().setLevel(
logging.ERROR if silent else logging.DEBUG if verbose else logging.INFO
)

# If prompted, print the schema and camlhmp version, then exit
if version:
print_camlhmp_version()

# Verify remaining input files
input_path = validate_file(input)
logging.debug(f"Processing {input}")

# Create the output directory
logging.debug(f"Creating output directory: {outdir}")
Path(f"{outdir}/reference_seqs").mkdir(parents=True, exist_ok=True)

# Output files
thresholds_tsv = f"{outdir}/{prefix}.tsv".replace("//", "/")

# Make sure output files don't already exist
file_exists_error(thresholds_tsv, force)

# Describe the command line arguments
console = rich.console.Console(stderr=True)
print(
"[italic]Running [deep_sky_blue1]camlhmp-blast-thresholds[/deep_sky_blue1] with following parameters:[/italic]",
file=sys.stderr,
)
print(f"[italic] --input {input}[/italic]", file=sys.stderr)
print(f"[italic] --blast {input}[/italic]", file=sys.stderr)
print(f"[italic] --outdir {outdir}[/italic]", file=sys.stderr)
print(f"[italic] --prefix {prefix}[/italic]", file=sys.stderr)
print(f"[italic] --min-pident {min_pident}[/italic]", file=sys.stderr)
print(f"[italic] --min-coverage {min_coverage}[/italic]\n", file=sys.stderr)

print(
f"[italic]Gathering seqeuences from {input}...[/italic]",
file=sys.stderr,
)

# Get reference seqeunce names
reference_seqs = {}
for seq in parse_seqs(input_path, "fasta"):
if seq.id not in reference_seqs:
reference_seqs[seq.id] = []
reference_seqs[seq.id].append(seq.seq)

# Write seqs for each reference into separate files
print(
f"[italic]Writing reference seqeuences to {outdir}/reference_seqs...[/italic]",
file=sys.stderr,
)
reference_failures = {}
for ref, seqs in sorted(reference_seqs.items()):
if ref not in reference_failures:
reference_failures[ref] = {
"pident": "-",
"coverage": "-",
"hits": "-",
"comment": "",
}
with open(f"{outdir}/reference_seqs/{ref}.fasta", "w") as fh:
for seq in seqs:
fh.write(f">{ref}\n{seq}\n")

# Run blast for each reference seq, adjusting thresholds each time
references = reference_seqs.keys()
max_pident_failure = 0
max_coverage_failure = 0
for ref in references:
print(f"Detecting failure for {ref}", file=sys.stderr)
pident = 100
has_failure = False
while pident >= min_pident:
coverage = 100
while coverage >= min_coverage:
# Run blast
logging.debug(f"Running {ref} with pident={pident} and coverage={coverage}")
hits, blast_stdout, blast_stderr = run_blast(
blast, f"{outdir}/reference_seqs/{ref}.fasta", input_path, pident, coverage
)
# Determine if we've gotten a hit that's not the reference, if so, we've failed
for hit, status in get_blast_target_hits(references, hits).items():
if status:
if hit.split("_")[0] != ref.split("_")[0]:
has_failure = True
reference_failures[ref]["pident"] = str(pident)
reference_failures[ref]["coverage"] = str(coverage)
reference_failures[ref]["hits"] = ",".join(hits)

# Determine the max pident and coverage failures
if pident > max_pident_failure and pident != 100:
max_pident_failure = pident
if coverage > max_coverage_failure and coverage != 100:
max_coverage_failure = coverage

# Add comment about potential overlap or containment
if int(coverage) == 100 or int(pident) == 100:
reference_failures[ref]["comment"] = "Suspected overlap or containment with another target: "
print(
f"Detection failure for {ref} with pident={pident} and coverage={coverage} - {hits}",
file=sys.stderr,
)
coverage -= increment
if has_failure:
break
if has_failure:
break
pident -= increment

# Write the results
print(
f"[italic]Writing results to {thresholds_tsv}...[/italic]",
file=sys.stderr,
)

# Write the results
final_results = []
for ref, vals in sorted(reference_failures.items()):
final_results.append(
{
"reference": ref,
"pident": vals["pident"],
"coverage": vals["coverage"],
"hits": vals["hits"],
"comment": f"no detection failures for pident>={min_pident} and converage>={min_coverage}" if vals["hits"] == "-" else vals["comment"],
}
)
write_tsv(final_results, thresholds_tsv)

# Finalize the results
print("[italic]Final Results...[/italic]", file=sys.stderr)
type_table = Table(title=f"Thresholds Detection")
type_table.add_column("reference", style="white")
type_table.add_column("pident_failure", style="cyan")
type_table.add_column("coverage_failure", style="cyan")
type_table.add_column("hits", style="cyan")
type_table.add_column("comment", style="cyan")

for result in final_results:
type_table.add_row(
result["reference"],
result["pident"],
result["coverage"],
result["hits"],
result["comment"],
)
console.print(type_table)

# Print suggested thresholds
print(
f"[italic]Suggested thresholds for specificity: pident>{max_pident_failure} and coverage>{max_coverage_failure}[/italic]",
file=sys.stderr,
)
print(
f"[italic]**NOTE** these are suggestions for a starting point[/italic]",
file=sys.stderr,
)


def main():
if len(sys.argv) == 1:
camlhmp_blast_thresholds.main(["--help"])
else:
camlhmp_blast_thresholds()


if __name__ == "__main__":
main()
32 changes: 32 additions & 0 deletions camlhmp/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ def read_framework(yamlfile: str) -> dict:
return parse_yaml(yamlfile)


def print_camlhmp_version() -> None:
"""
Print the version of camlhmp, then exit
Args:
framework (dict): the parsed YAML framework
Examples:
>>> from camlhmp.framework import print_camlhmp_version
>>> print_camlhmp_version()
"""
print(f"camlhmp, version {camlhmp.__version__}", file=sys.stderr)
sys.exit(0)


def print_version(framework: dict) -> None:
"""
Print the version of the framework, then exit
Expand All @@ -43,6 +58,23 @@ def print_version(framework: dict) -> None:
sys.exit(0)


def print_versions(frameworks: list) -> None:
"""
Print the version of the framework, then exit
Args:
frameworks (list[dict]): A list of parsed YAML frameworks
Examples:
>>> from camlhmp.framework import print_version
>>> print_versions([framework1, framework2])
"""
print(f"camlhmp, version {camlhmp.__version__}", file=sys.stderr)
for framework in frameworks:
print(f"schema {framework['metadata']['id']}, version {framework['metadata']['version']}", file=sys.stderr)
sys.exit(0)


def get_types(framework: dict) -> dict:
"""
Get the types from the framework.
Expand Down
Loading

0 comments on commit 14090cb

Please sign in to comment.