diff --git a/.gitignore b/.gitignore index 0d2f9dc..66d9ec9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.tsv camlhmp-extract/ +camlhmp-blast-thresholds/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 693bf5d..b694ae4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog -## v1.0.1 rpetit3/camlhmp "" 2024/08/?? +## v1.0.1 rpetit3/camlhmp "Bactrian" 2024/08/25 ### `Added` diff --git a/camlhmp/cli/blast/thresholds.py b/camlhmp/cli/blast/thresholds.py index 4a768f0..5cbcda7 100644 --- a/camlhmp/cli/blast/thresholds.py +++ b/camlhmp/cli/blast/thresholds.py @@ -157,7 +157,7 @@ def camlhmp_blast_thresholds( file=sys.stderr, ) print(f"[italic] --input {input}[/italic]", file=sys.stderr) - print(f"[italic] --blast {input}[/italic]", file=sys.stderr) + print(f"[italic] --blast {blast}[/italic]", file=sys.stderr) print(f"[italic] --outdir {outdir}[/italic]", file=sys.stderr) print(f"[italic] --prefix {prefix}[/italic]", file=sys.stderr) print(f"[italic] --min-pident {min_pident}[/italic]", file=sys.stderr) @@ -228,7 +228,7 @@ def camlhmp_blast_thresholds( if int(coverage) == 100 or int(pident) == 100: reference_failures[ref]["comment"] = "Suspected overlap or containment with another target: " print( - f"Detection failure for {ref} with pident={pident} and coverage={coverage} - {hits}", + f"Detected failure for {ref} with pident={pident} and coverage={coverage} - {hits}", file=sys.stderr, ) coverage -= increment @@ -250,10 +250,10 @@ def camlhmp_blast_thresholds( final_results.append( { "reference": ref, - "pident": vals["pident"], - "coverage": vals["coverage"], + "pident_failure": vals["pident"], + "coverage_failure": vals["coverage"], "hits": vals["hits"], - "comment": f"no detection failures for pident>={min_pident} and converage>={min_coverage}" if vals["hits"] == "-" else vals["comment"], + "comment": f"no detection failures for pident>={min_pident} and coverage>={min_coverage}" if vals["hits"] == "-" else vals["comment"], } ) write_tsv(final_results, thresholds_tsv) @@ -270,8 +270,8 @@ def camlhmp_blast_thresholds( for result in final_results: type_table.add_row( result["reference"], - result["pident"], - result["coverage"], + result["pident_failure"], + result["coverage_failure"], result["hits"], result["comment"], ) diff --git a/docs/cli/blast/camlhmp-blast-thresholds.md b/docs/cli/blast/camlhmp-blast-thresholds.md new file mode 100644 index 0000000..a466f38 --- /dev/null +++ b/docs/cli/blast/camlhmp-blast-thresholds.md @@ -0,0 +1,153 @@ +--- +title: camlhmp-blast-thresholds +description: >- + Determine the specificity thresholds for a set of reference sequences +--- + +# `camlhmp-blast-thresholds` + +`camlhmp-blast-thresholds` is a command that allows users to determine the specificity +percent identity and coverage thresholds when using BLAST+. This command will start at +100 percent identity and coverage and work its way down until a reference sequence can +no longer be distinguished from other reference sequences. + +## Usage + +```bash + Usage: camlhmp-blast-thresholds [OPTIONS] + + ๐Ÿช camlhmp-blast-thresholds ๐Ÿช - Determine the specificity thresholds for a set of + reference sequences + +โ•ญโ”€ Options โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ +โ”‚ * --input -i TEXT Input file in FASTA format of โ”‚ +โ”‚ reference sequences โ”‚ +โ”‚ [required] โ”‚ +โ”‚ * --blast -b [blastn|blastp|blastx|tblastn The blast algorithm to use โ”‚ +โ”‚ |tblastx] [required] โ”‚ +โ”‚ --outdir -o PATH Directory to write output โ”‚ +โ”‚ [default: โ”‚ +โ”‚ ./camlhmp-blast-thresholds] โ”‚ +โ”‚ --prefix -p TEXT Prefix to use for output files โ”‚ +โ”‚ [default: camlhmp] โ”‚ +โ”‚ --min-pident INTEGER Minimum percent identity to โ”‚ +โ”‚ test โ”‚ +โ”‚ [default: 70] โ”‚ +โ”‚ --min-coverage INTEGER Minimum percent coverage to โ”‚ +โ”‚ test โ”‚ +โ”‚ [default: 70] โ”‚ +โ”‚ --increment INTEGER The value to increment the โ”‚ +โ”‚ thresholds by โ”‚ +โ”‚ [default: 1] โ”‚ +โ”‚ --force Overwrite existing reports โ”‚ +โ”‚ --verbose Increase the verbosity of โ”‚ +โ”‚ output โ”‚ +โ”‚ --silent Only critical errors will be โ”‚ +โ”‚ printed โ”‚ +โ”‚ --version Print schema and camlhmp โ”‚ +โ”‚ version โ”‚ +โ”‚ --help Show this message and exit. โ”‚ +โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ +``` + +## Example Usage + +To run `camlhmp-blast-thresholds`, you will need a FASTA file of your reference sequences. +Below is an example of how to run `camlhmp-blast-thresholds` using available test data. + +```bash +camlhmp-blast-thresholds \ + --input tests/data/blast/targets/sccmec-partial.fasta \ + --blast blastn + +Running camlhmp-blast-thresholds with following parameters: + --input tests/data/blast/targets/sccmec-partial.fasta + --blast blastn + --outdir ./camlhmp-blast-thresholds + --prefix camlhmp + --min-pident 70 + --min-coverage 70 + +Gathering seqeuences from tests/data/blast/targets/sccmec-partial.fasta... +Writing reference seqeuences to ./camlhmp-blast-thresholds/reference_seqs... +Detecting failure for ccrA1 +Detected failure for ccrA1 with pident=75 and coverage=100 - ['ccrA1', 'ccrA2'] +Detecting failure for ccrA2 +Detected failure for ccrA2 with pident=75 and coverage=100 - ['ccrA1', 'ccrA2'] +Detecting failure for ccrA3 +Detected failure for ccrA3 with pident=75 and coverage=95 - ['ccrA1', 'ccrA3'] +Detecting failure for ccrB1 +Detecting failure for ccrB2 +Detecting failure for ccrB3 +Detecting failure for IS1272 +Detecting failure for mecI +Detecting failure for mecR1 +Detecting failure for mecA +Detecting failure for IS431 +Writing results to ./camlhmp-blast-thresholds/camlhmp.tsv... +Final Results... + Thresholds Detection +โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ +โ”ƒ reference โ”ƒ pident_failure โ”ƒ coverage_failure โ”ƒ hits โ”ƒ comment โ”ƒ +โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ +โ”‚ IS1272 โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ IS431 โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ ccrA1 โ”‚ 75 โ”‚ 100 โ”‚ ccrA1,ccrA2 โ”‚ Suspected overlap or containment with another target: โ”‚ +โ”‚ ccrA2 โ”‚ 75 โ”‚ 100 โ”‚ ccrA1,ccrA2 โ”‚ Suspected overlap or containment with another target: โ”‚ +โ”‚ ccrA3 โ”‚ 75 โ”‚ 95 โ”‚ ccrA1,ccrA3 โ”‚ โ”‚ +โ”‚ ccrB1 โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ ccrB2 โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ ccrB3 โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ mecA โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ mecI โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ”‚ mecR1 โ”‚ - โ”‚ - โ”‚ - โ”‚ no detection failures for pident>=70 and coverage>=70 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +Suggested thresholds for specificity: pident>75 and coverage>95 +**NOTE** these are suggestions for a starting point +``` + +!!! Note + The final results will suggest a starting point for the specificity thresholds. You should + still validate these thresholds with your own data. You may also notice that shorter + sequences are more susceptible to percent identity and longer sequences are more susceptible + to coverage. + +## Output Files + +`camlhmp-blast-thresholds` will generate an output directory called `camlhmp-blast-thresholds` +by default and in there will be the individual reference sequences and a final table of +suggested cutoffs. + +| File Name | Description | +|------------------------|-----------------------------------------------------------------------------------------| +| `{PREFIX}.tsv` | A tab-delimited file the threshold failures (if observed) of each reference sequence | + +### {PREFIX}.tsv + +The `{PREFIX}.tsv` file is a tab-delimited file the threshold failures (if observed) of each +reference sequence . The columns are: + +| Column | Description | +|------------------|------------------------------------------------------------------------------------| +| reference | The reference sequence name | +| pident_failure | The point at which percent identity no longer differentiated from other sequences | +| coverage_failure | The point at which coverage identity no longer differentiated from other sequences | +| hits | The other reference sequences that also had a hit | +| comment | A small comment about the result | + +Below is an example of the `{PREFIX}.tsv` file: + +```tsv +reference pident_failure coverage_failure hits comment +IS1272 - - - no detection failures for pident>=70 and coverage>=70 +IS431 - - - no detection failures for pident>=70 and coverage>=70 +ccrA1 75 100 ccrA1,ccrA2 Suspected overlap or containment with another target: +ccrA2 75 100 ccrA1,ccrA2 Suspected overlap or containment with another target: +ccrA3 75 95 ccrA1,ccrA3 +ccrB1 - - - no detection failures for pident>=70 and coverage>=70 +ccrB2 - - - no detection failures for pident>=70 and coverage>=70 +ccrB3 - - - no detection failures for pident>=70 and coverage>=70 +mecA - - - no detection failures for pident>=70 and coverage>=70 +mecI - - - no detection failures for pident>=70 and coverage>=70 +mecR1 - - - no detection failures for pident>=70 and coverage>=70 +``` diff --git a/mkdocs.yml b/mkdocs.yml index 02db255..bd4273d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -149,6 +149,7 @@ nav: - 'blast-alleles': 'cli/blast/camlhmp-blast-alleles.md' - 'blast-regions': 'cli/blast/camlhmp-blast-regions.md' - 'blast-targets': 'cli/blast/camlhmp-blast-targets.md' + - 'blast-thresholds': 'cli/blast/camlhmp-blast-thresholds.md' - 'Utility': - 'camlhmp-extract': 'cli/camlhmp-extract.md' - 'API':