diff --git a/.gitignore b/.gitignore index c277cc9..263cc94 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ Cargo.lock lib/ out_dir/ slurm.sh +downloads/ +test_database/ diff --git a/Cargo.toml b/Cargo.toml index 2258ff0..1f8cc06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [workspace] members = [ "ncbi", - "kr2r" -] + "kr2r", + "seqkmer"] resolver = "2" diff --git a/README.md b/README.md index d276f9a..830805e 100644 --- a/README.md +++ b/README.md @@ -1,237 +1,134 @@ -# kraken2-rust +This workspace contains two projects: `kr2r` and `ncbi`. The `kr2r` project includes an example that demonstrates how to use the `kun_peng` binary, a tool for processing gene classification, to build a database and process a sample file. -## 0.Installation Instructions -To install kraken2-rust, follow these steps: +## Get Started -1. Download the appropriate version for your system: +Follow these steps to build the projects and run the example. -Navigate to the Releases page of the kraken2-rust GitHub repository. -Select the release suitable for your operating system. For example, if you are using CentOS 7, download kraken-rust-${VERSION}-centos7.tar.gz, where ${VERSION} is the version number of the release you wish to install. +### Build the Projects -2. Extract the downloaded archive: +First, ensure that both projects are built. You can do this by running the following command from the root of the workspace: -Open a terminal. -Use the tar command to extract the files from the archive - -```bash -tar -xvf kraken-rust-${VERSION}-centos7.tar.gz +```sh +cargo build --release ``` -## 1. NCBI download tool - -Downloading Genome Data with NCBI Tool -The ncbi command-line tool offers functionality to download genome data from the NCBI database. This can be done for various groups including archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other and invertebrate. 
- - -## Key Features -* Resumable Downloads: The tool supports breakpoint resumption, allowing downloads to pause and resume without starting over. This is particularly useful for large files or in conditions of unstable network connections. - -* Incremental Download: Users can perform incremental downloads, where only new or updated files in the directory are downloaded. This saves time and bandwidth by avoiding redundant downloads of previously obtained data. +This will build the kr2r and ncbi projects in release mode. -* Automatic MD5 Verification: To ensure data integrity, the tool automatically verifies the MD5 checksum of downloaded files. This step confirms that the files are not corrupted or tampered with during the download process. +### Run the `kun_peng` Example -### Genomes Command -To download genome data, use the genomes command with the following syntax: +Next, run the example script that demonstrates how to use the kun_peng binary. Execute the following command from the root of the workspace: -```bash -ncbi genomes [OPTIONS] --group [COMMAND] +```sh +cargo run --release --example build_and_classify --package kr2r ``` -### Subcommands +This will run the build_and_classify.rs example located in the kr2r project's examples directory. -* md5: Checks the md5 of the file only. -* fna: Parses genomic files and generates library fna files. -* assembly: Downloads and parses assembly files only. -* url: Downloads genomic files from a specified URL address. -* help: Print this message or the help of the given subcommand(s). -### Options -* --site : Choose the NCBI site directory to download from (RefSeq or GenBank). Defaults to refseq. Possible values are: -*genbank*: Download genbank resources. -*refseq*: Download refseq resources. -*all*: Download genbank and refseq resources. +#### Example Output +You should see output similar to the following: -* --asm-level : Set the assembly level for the download. Default is `basic`. ["Complete Genome", "Chromosome"]. 
`all` is ["Complete Genome", "Chromosome", "Scaffold", "Contig"]. -* -g, --group : Specifies the category of data to download from the NCBI site. The group can be one or a comma-separated list of the following: archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other, invertebrate. -* -h, --help: Print help information (for a summary, use '-h'). +```txt +Executing command: /path/to/workspace/target/release/kun_peng build --download-dir data/ --db test_database +kun_peng build output: [build output here] +kun_peng build error: [any build errors here] -### Examples - -To download genome data for bacteria from RefSeq: - -```bash -ncbi genomes --group bacteria --site refseq +Executing command: /path/to/workspace/target/release/kun_peng direct --db test_database data/COVID_19.fa +kun_peng direct output: [direct output here] +kun_peng direct error: [any direct errors here] ``` -To check the md5 of genomic files for fungi: -```bash -ncbi genomes --group fungi md5 -``` +This output confirms that the kun_peng commands were executed successfully and the files were processed as expected. -For more detailed help on a specific command, you can use the help subcommand: -```bash -ncbi help genomes -``` -This tool simplifies the process of downloading and processing genome data from NCBI, making it accessible for various research and analysis purposes. +### Run the `ncbi` Example +Run the example script in the ncbi project to download the necessary files. Execute the following command from the root of the workspace: - -### Generate fna file - -```bash -ncbi gen --site all -g archaea fna +```sh +cargo run --release --example run_download --package ncbi ``` +This will run the run_download.rs example located in the ncbi project's examples directory. The script will: -## 2 Squid Tool - -Squid is a versatile command-line tool designed for the efficient processing and classification of biological sequences. 
With its suite of functionalities, Squid facilitates various tasks related to sequence analysis, taxonomy resolution, and database management, making it an essential utility for bioinformatics workflows. - -### Features -Squid offers a wide range of commands, each tailored for specific aspects of sequence data processing: - -* estimate: Estimate the capacity requirements for database construction or analysis, aiding in resource planning. -* seqid2taxid: Generate a mapping file from sequence identifiers to taxonomic IDs, facilitating the association of sequences with their respective taxonomic lineage. -* build: Construct a Squid database from a collection of sequences, optimizing it for subsequent analysis tasks. -* hashshard: Divide a hash file into smaller, more manageable shards, improving the efficiency of data processing. -* splitr: Split FASTQ or FASTA files into ranges based on sequence identifiers or other criteria, aiding in the parallel processing of large datasets. -* annotate: Annotate a set of sequences with taxonomic or other relevant information, enriching the dataset for further analysis. -* resolve: Resolve the taxonomic tree for a set of sequences, identifying their positions within the taxonomic hierarchy. -* classify: A comprehensive workflow that integrates splitr, annotate, and resolve into a unified process for the classification of sequence data. This command streamlines the task of * assigning taxonomic classifications to sequences. - -### Getting Started -To get started with Squid, you can invoke the tool with the -h or --help option to display detailed help messages for each command: - - -```bash -./kun_peng -h -Usage: kun_peng +1. Ensure the necessary directories exist. +2. 
Download the required files using the ncbi binary with the following commands: + * ./target/release/ncbi -d downloads gen -g archaea + * ./target/release/ncbi -d downloads tax -Commands: - estimate estimate capacity - seqid2taxid seqid to taxid map file - build build database - hashshard split hash file - splitr Split fast(q/a) file into ranges - annotate annotate a set of sequences - resolve resolve taxonomy tree - classify Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences - help Print this message or the help of the given subcommand(s) -Options: - -h, --help Print help - -V, --version Print version -``` - -This will provide you with an overview of all available commands and options. For specific information about a subcommand, use: +#### Example Output +You should see output similar to the following: +```txt +Executing command: /path/to/workspace/target/release/ncbi -d /path/to/workspace/downloads gen -g archaea +NCBI binary output: [download output here] -```bash -./kun_peng -h +Executing command: /path/to/workspace/target/release/ncbi -d /path/to/workspace/downloads tax +NCBI binary output: [download output here] ``` -Replace with the name of the subcommand for which you need detailed help, such as estimate, build, or classify. -## 2.1 Seqid2taxid Tool +## ncbi tool -The seqid2taxid tool is a utility within the kr2r package designed to facilitate the mapping of sequence identifiers (seqid) to taxonomic identifiers (taxid). This tool is essential for researchers and bioinformaticians working with genomic data, enabling them to easily relate sequence data to taxonomic information. +The ncbi binary is used to download resources from the NCBI website. 
Here is the help manual for the ncbi binary: +```sh +./target/release/ncbi -h +ncbi download resource -### Usage +Usage: ncbi [OPTIONS] -```bash -kun_peng seqid2taxid -h - -Usage: kun_peng seqid2taxid [OPTIONS] --source +Commands: + taxonomy Download taxonomy files from NCBI (alias: tax) + genomes Download genomes data from NCBI (alias: gen) + help Print this message or the help of the given subcommand(s) Options: - --source the database directory - -f, --map-file seqid2taxid.map file path, default = $source/seqid2taxid.map - -h, --help Print help - -V, --version Print version - -``` - -To use the seqid2taxid tool, execute it with the required and optional arguments as follows: - -```bash -kun_peng seqid2taxid [OPTIONS] --source + -d, --download-dir Directory to store downloaded files [default: lib] + -n, --num-threads Number of threads to use for downloading [default: 20] + -h, --help Print help (see more with '--help') + -V, --version Print version ``` -### Required Options -* --source : Specifies the database directory containing the sequence and taxonomic data. - -### Optional Options -* -f, --map-file : Path to the seqid2taxid.map file. If not specified, the tool defaults to using $source/seqid2taxid.map, where $source is the path provided by the * --source option. -* -h, --help: Displays help information about the tool and its options. -* -V, --version: Prints the version of the seqid2taxid tool. - -### Example Command -To run the seqid2taxid tool with a specific source directory: - -```bash -kun_peng seqid2taxid --source /path/to/database -``` -To specify a custom map file path: +## kun_peng tool -```bash -kun_peng seqid2taxid --source /path/to/database -f /path/to/custom/seqid2taxid.map -``` - -## 2.2 Estimate Capacity Tool - -The estimate_capacity tool is designed for estimating the capacity required for building a database from genomic data. 
It takes into consideration various parameters related to the genomic data processing, such as k-mer length, minimizer length, and hash table load factor, to provide an efficient estimate of the necessary resources. - -### Usage - -To use the estimate_capacity tool, execute it from the command line with the desired options: +```sh +Usage: kun_peng -```bash -kun_peng estimate_capacity [OPTIONS] -``` +Commands: + estimate estimate capacity + build build `k2d` files + hashshard split hash file + splitr Split fast(q/a) file into ranges + annotate annotate a set of sequences + resolve resolve taxonomy tree + classify Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences + direct Directly load all hash tables for classification annotation + merge-fna A tool for processing genomic files + help Print this message or the help of the given subcommand(s) -Options -* --source : Specifies the build database directory or file. Default is lib. -* --cache: Estimates capacity from cache if exists. -* -k, --k-mer : Sets the length of k-mers. K must be a positive integer (default is 35). K cannot be less than L. -* -l, --l-mer : Sets the length of minimizers. L must be between 1 and 31 (default is 31). -* -n, --n : Sets the maximum qualifying hash code (default is 4). -* --minimizer-spaces : Specifies the number of characters in the minimizer that are ignored in comparisons (default is 7). -* -T, --toggle-mask : Defines the minimizer ordering toggle mask. -* --load-factor : Sets the proportion of the hash table to be populated (only for build task; default is 0.7, must be between 0 and 1). -* -p, --threads : Specifies the number of threads to use (default is 10). -* -h, --help: Prints help information (for more details, use '--help'). -* -V, --version: Prints the version of the tool. 
- -### Example - -```bash -kun_peng estimate_capacity -k 35 -l 31 --source /data/ncbi/path -p 10 --load-factor 0.7 +Options: + -h, --help Print help + -V, --version Print version ``` -### Output - -```bash -estimate count: 1213069985, required capacity: 1732968825.0, Estimated hash table requirement: 6.46GB -``` +### build database -## 2.3 build +Build the kun_peng database like Kraken2, specifying the directory for the data files downloaded from NCBI, as well as the database directory. -```bash -./kun_peng build -h +```sh +./target/release/kun_peng build -h build database -Usage: kun_peng build [OPTIONS] --source -H -o -t -m --ncbi-taxonomy-directory --required-capacity --chunk-dir +Usage: kun_peng build [OPTIONS] --download-dir --db Options: - --source - build database directory or file - -H - Kraken 2 hash table filename - -o - Kraken 2 options filename + -d, --download-dir + Directory to store downloaded files + --db + ncbi library fna database directory -k, --k-mer Set length of k-mers, k must be positive integer, k=35, k cannot be less than l [default: 35] -l, --l-mer @@ -245,19 +142,7 @@ Options: -r, --requested-bits-for-taxid Bit storage requested for taxid 0 <= r < 31 [default: 0] -p, --threads - Number of threads [default: 4] - -t - Kraken 2 taxonomy filename - -m - Sequence ID to taxon map filename - -n, --ncbi-taxonomy-directory - NCBI taxonomy directory name - -c, --required-capacity - - --chunk-dir - chunk directory - --chunk-size - chunk size 1-4(GB) [default: 1073741824] + Number of threads [default: 10] --cache estimate capacity from cache if exists --max-n @@ -270,92 +155,46 @@ Options: Print version ``` -## 2.4 hashshard -```bash -./kun_peng hashshard -h -split hash file +### classify -Usage: kun_peng hashshard [OPTIONS] --db +The classification process is divided into three modes: -Options: - --db The database directory for the Kraken 2 index. 
contains index file(hash.k2d opts.k2d taxo.k2d) - --hash-dir database hash chunk directory and other files - --hash-capacity default: 1073741824(capacity 1G = file size 4G) - -h, --help Print help (see more with '--help') - -V, --version Print version -``` +1. Direct Processing Mode: -## 2.5 splitr +* Description: In this mode, all database files are loaded simultaneously, which requires a significant amount of memory. Before running this mode, you need to check the total size of hash_*.k2d files in the database directory using the provided script (bash cal_memory.sh out_dir). Ensure that your available memory meets or exceeds this size. +* Characteristics: + * High memory requirements + * Fast performance -```bash -./kun_peng splitr -h -Split fast(q/a) file into ranges +```sh +./target/release/kun_peng direct -h +Directly load all hash tables for classification annotation -Usage: kun_peng splitr [OPTIONS] --hash-dir --chunk-dir [INPUT_FILES]... +Usage: kun_peng direct [OPTIONS] --db [INPUT_FILES]... Arguments: [INPUT_FILES]... A list of input file paths (FASTA/FASTQ) to be processed by the classify program Options: - --hash-dir + --db database hash chunk directory and other files -P, --paired-end-processing Enable paired-end processing -S, --single-file-pairs Process pairs with mates in the same file -Q, --minimum-quality-score - Minimum quality score for FASTQ data, default is 0 [default: 0] - -p, --num-threads - The number of threads to use, default is 1 [default: 10] - --chunk-dir - chunk directory - -h, --help - Print help (see more with '--help') - -V, --version - Print version -``` - -## 2.6 annotate - -```bash -annotate a set of sequences - -Usage: kun_peng annotate [OPTIONS] --hash-dir --chunk-dir - -Options: - --hash-dir database hash chunk directory and other files - --chunk-dir The file path for the Kraken 2 options. 
chunk directory - --batch-size 批量处理大小 default: 8MB [default: 8388608] - -h, --help Print help (see more with '--help') - -V, --version Print version -``` - - -## 2.7 resolve - -```bash -resolve taxonomy tree - -Usage: kun_peng resolve [OPTIONS] --hash-dir --chunk-dir - -Options: - --hash-dir - database hash chunk directory and other files - --chunk-dir - chunk directory - --full-output - output file contains all unclassified seq + Minimum quality score for FASTQ data [default: 0] -T, --confidence-threshold - Confidence score threshold, default is 0.0 [default: 0] + Confidence score threshold [default: 0] -K, --report-kmer-data In comb. w/ -R, provide minimizer information in report -z, --report-zero-counts In comb. w/ -R, report taxa w/ 0 count -g, --minimum-hit-groups The minimum number of hit groups needed for a call [default: 2] - --batch-size - 批量处理大小 default: 8MB [default: 8388608] + -p, --num-threads + The number of threads to use [default: 10] --output-dir File path for outputting normal Kraken output -h, --help @@ -364,39 +203,46 @@ Options: Print version ``` +2. Chunk Processing Mode: -## 2.8 classify +* Description: This mode processes the sample data in chunks, loading only a small portion of the database files at a time. This reduces the memory requirements, needing a minimum of 4GB of memory plus the size of one pair of sample files. +* Characteristics: + * Low memory consumption + * Slower performance compared to Direct Processing Mode -```bash -./kun_peng classify -h + +```sh +./target/release/kun_peng classify -h Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences -Usage: kun_peng classify [OPTIONS] --hash-dir --chunk-dir [INPUT_FILES]... +Usage: kun_peng classify [OPTIONS] --db --chunk-dir [INPUT_FILES]... Arguments: [INPUT_FILES]... 
A list of input file paths (FASTA/FASTQ) to be processed by the classify program Options: - --hash-dir - database hash chunk directory and other files + --db + + --chunk-dir + chunk directory + --output-dir + File path for outputting normal Kraken output -P, --paired-end-processing Enable paired-end processing -S, --single-file-pairs Process pairs with mates in the same file -Q, --minimum-quality-score - Minimum quality score for FASTQ data, default is 0 [default: 0] + Minimum quality score for FASTQ data [default: 0] -p, --num-threads - The number of threads to use, default is 1 [default: 10] - --chunk-dir - chunk directory + The number of threads to use [default: 10] --batch-size - 批量处理大小 default: 8MB [default: 8388608] + 批量处理大小 default: 16MB [default: 16777216] -T, --confidence-threshold - Confidence score threshold, default is 0.0 [default: 0] + Confidence score threshold [default: 0] -g, --minimum-hit-groups The minimum number of hit groups needed for a call [default: 2] - --output-dir - File path for outputting normal Kraken output + --kraken-db-type + Enables use of a Kraken 2 compatible shared database -K, --report-kmer-data In comb. w/ -R, provide minimizer information in report -z, --report-zero-counts @@ -409,149 +255,10 @@ Options: Print version ``` -## 3. build_k2_db - -The build_k2_db command-line tool facilitates the construction of a Kraken 2 database. It requires specific filenames for the hash table, taxonomy, and the sequence ID to taxon map, among other parameters. - - -### Introduction -The build_k2_db tool introduces a novel approach to constructing Kraken 2-compatible databases, specifically addressing the challenges associated with the large memory requirements of previous methods. This documentation outlines the process flow, working principles, and the inherent advantages of using the build_k2_db tool for genomic database construction. 
- -The build_k2_db tool revolutionizes the process of building genomic databases for Kraken 2 by introducing a novel, two-step approach to database construction. This method significantly mitigates the challenges associated with the large memory requirements of traditional database building processes, particularly vital for constructing databases like the NCBI RefSeq, which are substantial in size. - -### Working Principle -#### Step 1: Preprocessing and Generation of k2 Formatted Files -Initially, the tool preprocesses .fna files to generate intermediary files in a k2 format. This step involves scanning the .fna files to extract relevant k-mer and minimizer information, mapping these to taxonomic IDs, and then hashing these elements to produce indexed intermediary data. These intermediary files are crucial for the next step of the process, as they contain indexed positions and taxonomic IDs necessary for constructing the hash table efficiently. - -#### Step 2: Iterative Construction of the Hash Table -In the second phase, the tool iteratively processes the k2 formatted intermediary files to build segments of the hash table. This method involves reading the intermediary files in batches, resolving any taxonomic ID conflicts using a Lowest Common Ancestor (LCA) algorithm, and updating the hash table with the resolved IDs. This step-by-step processing significantly reduces the memory footprint compared to loading the entire hash table into memory at once. - -#### Efficiency and Advantages -The build_k2_db tool introduces several advantages over traditional database building methods: - -* Memory Efficiency: By generating intermediary files and processing these in chunks, the tool drastically reduces the required memory, enabling the construction of large databases on systems with limited memory capacity. 
-* Scalability: The approach is highly scalable, thanks to parallel processing and efficient handling of large .fna files, making it suitable for building extensive databases. -Time Efficiency: Despite the intermediary files being substantially larger than the final hash table, the overall time taken to build the database is comparable to methods that process all data at once. -Performance Insights -In a performance test involving the NCBI RefSeq database, approximately 500GB of .fna files were processed to generate 850GB of k2 intermediary files. The final hash table size amounted to 188GB. Utilizing a machine equipped with a 16-core CPU and 32GB of memory, the entire database construction process was completed in just 9 hours and 42 minutes. This showcases the tool's ability to handle large datasets efficiently, both in terms of time and hardware resource requirements. - -#### Comparative Analysis with Kraken 2 C++ Version in Fast Mode - -In addition to the innovative build_k2_db Rust-based tool, it's informative to compare its performance and resource utilization with that of the traditional Kraken 2 C++ version, particularly in its fast mode operation. Such a comparison underscores the advancements and efficiencies introduced by the Rust implementation. - -#### Kraken 2 C++ Version in Fast Mode: -For processing the same dataset from the NCBI RefSeq database (~500GB of .fna files), the Kraken 2 C++ version in fast mode presents the following resource requirements and performance metrics: - -CPU and Memory Usage: Requires a machine with a 16-core CPU and 200GB of memory, indicating a significantly higher demand for memory resources compared to the Rust-based build_k2_db tool. -Time Efficiency: Completes the database construction process in approximately 9 hours and 32 minutes. This duration is slightly shorter than that of the build_k2_db tool but at the cost of substantially higher memory requirements. 
- - -#### Key Insights and Implications: -Memory Optimization: The build_k2_db tool demonstrates exceptional memory efficiency by requiring only 32GB of memory to process and construct a database from a large genomic dataset. In contrast, the C++ version's fast mode requires 200GB of memory, highlighting the Rust-based tool's optimization in memory usage. -Comparable Time Efficiency: Despite the vast difference in memory consumption, the time taken to build the database is remarkably similar between the two tools, with the Rust version completing the task in 9 hours and 42 minutes versus 9 hours and 32 minutes for the C++ version. -Accessibility and Cost-effectiveness: By drastically reducing the memory requirement, the build_k2_db tool makes the process of building large genomic databases more accessible to researchers and institutions with limited hardware resources. This can significantly lower the computational costs associated with database construction in bioinformatics research. - - -#### Conclusion -The build_k2_db tool stands out for its innovative approach to genomic database construction, offering a memory-efficient, scalable, and time-effective solution. Its ability to preprocess data into intermediary files before iteratively constructing the hash table addresses the significant challenges of working with large-scale genomic databases, making it an invaluable asset in the field of bioinformatics. - -The build_k2_db tool not only matches the Kraken 2 C++ version in terms of processing time but does so with far less memory, making it a highly efficient and accessible option for constructing large genomic databases. Its innovative approach, leveraging Rust's performance and memory management capabilities, offers a more practical solution for the bioinformatics community, particularly when handling extensive datasets like the NCBI RefSeq database. 
- - - -### Usage -To build the Kraken 2 database, you must specify source, hash table, taxonomy, ID to taxon map filenames, Kraken 2 options filename, NCBI taxonomy directory, required capacity, and chunk directory. - -```bash -build_k2_db [OPTIONS] --source -H -t -m -o --ncbi-taxonomy-directory --required-capacity --chunk-dir -``` - -### Options - -* --source : Directory or file for database build. -* -H : Filename for the Kraken 2 hash table. -* -t : Filename for the Kraken 2 taxonomy. -* -m : Filename for the sequence ID to taxon map. -* -o : Filename for Kraken 2 options. -* -n, --ncbi-taxonomy-directory : Directory name for NCBI taxonomy. -* -k, --k-mer : Length of k-mers (default: 35). -* -l, --l-mer : Length of minimizers (default: 31). -* -r, --requested-bits-for-taxid : Bit storage for taxid (default: 0). -* -T, --toggle-mask : Minimizer ordering toggle mask (default: 16392584516609989165). -* --minimizer-spaces : Characters in minimizer ignored in comparisons (default: 7). -* -c, --required-capacity : Required capacity for the database. -* -p, --threads : Number of threads (default: 4). -* --chunk-dir : Directory for chunks. -* --chunk-size : Size of chunks in GB (default: 1GB). -* --chunk-prefix : Prefix for chunk files (default: chunk). -* --only-k2: Process only k2 file. -* -h, --help: Prints help information. -* -V, --version: Prints the version of the tool. - -### Example - -Building a database with custom parameters: - -```bash -build_k2_db --source /path/to/source -H hash_table.k2 -t taxonomy.k2 -m id_to_taxon.map -o options.k2 --ncbi-taxonomy-directory /path/to/ncbi/taxonomy --required-capacity 1000000 --chunk-dir /path/to/chunks -``` - - -## 4. classify - -The classify tool is a powerful sequence classification program designed for rapid and accurate classification of nucleotide sequences. It leverages the Kraken 2 indexing and taxonomy systems to efficiently assign taxonomic labels to sequences from FASTA/FASTQ files. 
This document provides a comprehensive guide on how to use the classify tool, including its options and arguments. - -### Usage - -To classify sequences using the classify tool, execute the command with the required options and input files: - -```bash -classify [OPTIONS] --index-filename --taxonomy-filename --options-filename [INPUT_FILES]... -``` - -#### Arguments - -* [INPUT_FILES]...: Specifies a list of input file paths. These files should be in FASTA or FASTQ format and contain the sequences to be classified. - -#### Options - -* -H, --index-filename : Path to the Kraken 2 index file. This file is essential for the classification process. -* -t, --taxonomy-filename : Path to the Kraken 2 taxonomy file. This file contains taxonomic information used for classification. -* -o, --options-filename : Path to the Kraken 2 options file. This file includes additional configuration options for Kraken 2. -* -T, --confidence-threshold : Sets the confidence score threshold for classification. Sequences with a confidence score below this threshold will not be * classified. The default value is 0.0. -* -p, --num-threads : Specifies the number of threads to use for processing. Increasing the number of threads can speed up the classification process. The default is 10. -* -g, --minimum-hit-groups : The minimum number of hit groups required for a classification call. The default is 2. -* -P, --paired-end-processing: Enables processing of paired-end reads. This option should be used if your input files contain paired-end sequence data. -* -S, --single-file-pairs: Indicates that pairs with mates are located in the same file. This option is relevant for paired-end processing. -* -O, --kraken-output-filename : Specifies the file path for outputting the standard Kraken output. This output includes the classification results for * each sequence. -* -Q, --minimum-quality-score : Sets the minimum quality score for FASTQ data. 
Sequences with a quality score below this threshold will not be classified. * The default is 0. -* -h, --help: Prints help information, providing a summary of options and usage. -* -V, --version: Displays the version of the classify tool. - -#### Example - -To classify sequences from a FASTQ file using 4 threads and a confidence threshold of 0.5: - -```bash -classify --index-filename path/to/index --taxonomy-filename path/to/taxonomy --options-filename path/to/options -T 0.5 -p 4 input_file.fastq -``` - - -## 5. inspect - -The inspect tool is designed for analyzing the content of hash table files used by Kraken 2. It provides insights into the index file, allowing users to verify and understand the structure and statistics of their Kraken 2 databases. - -### Usage -To utilize the inspect tool, execute the command with the necessary options: - -```bash -inspect [OPTIONS] --index-filename -``` +3. Step-by-Step Processing Mode: -### Options -* -H, --index-filename : Specifies the file path to the Kraken 2 index file. This option is required as it directs the tool to the hash table file that needs to be inspected. -* -t, --taxonomy-filename : Provides the file path to the Kraken 2 taxonomy file. This file contains the taxonomy information that corresponds to the data in the index file. Including this option allows for a more comprehensive inspection that may involve taxonomy data. -* -o, --options-filename : Indicates the file path to the Kraken 2 options file. This file can contain various configurations and options used by Kraken 2. * Specifying this option can help understand the configurations under which the index was created or used. -* -v, --value-count: This flag, when set, instructs the tool to iterate through the index file and count the values. It is useful for users who wish to understand the * distribution of data within their Kraken 2 index file. 
-* -h, --help: Prints out help information, providing a brief summary of all the available options and their usage. -* -V, --version: Displays the version of the inspect tool, helping users to identify the tool's version they are currently using. +* Description: This mode breaks down the chunk processing mode into individual steps, providing greater flexibility in managing the entire classification process. +* Characteristics: + * Flexible processing steps + * Similar memory consumption to Chunk Processing Mode + * Performance varies based on execution steps diff --git a/cal_memory.sh b/cal_memory.sh new file mode 100644 index 0000000..92e1450 --- /dev/null +++ b/cal_memory.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +directory=$1 + +# Find all hash_*.k2d files and calculate their total size +total_size=$(find "$directory" -name "hash_*.k2d" -exec du -ch {} + | grep total$ | awk '{print $1}') +echo "Total size of hash_*.k2d files: $total_size" diff --git a/data/COVID_19.fa b/data/COVID_19.fa new file mode 100644 index 0000000..8fa02d5 --- /dev/null +++ b/data/COVID_19.fa @@ -0,0 +1,375 @@ +>kraken:taxid|2697049|NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAA +AATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGG +ACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT +CGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGC +CTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACAT +CTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAA +ACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC +GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAG +AACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGA 
+TCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACG +GAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTA +GCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCG +TGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAAT +TGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA +CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATG +CAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAG +CCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTT +GTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGG +CTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACA +AGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGT +CTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCAT +TCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAA +CAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCT +TGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGA +GACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTT +GTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGA +AGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA +TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTT +TTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCA +CTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAA +TTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCA +TTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGA 
+AATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTG +CACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT +GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGC +CTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACCACTGGGCATTGATTTAGATG +AGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTAC +CCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGA +AGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATT +GGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATT +GTTGAGGTTCAACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACAGTGGTTGTTA +ATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTT +GAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAA +ACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTA +ATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGT +GTAGATACTGTTCGCACAAATGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGA +AATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA +AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGAA +ACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAG +TGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCAAGAGGGTGTTTTAACTGCTG +TGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAATGCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTAT +ATAACCACTTACCCGGGTCAGGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGC +CTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGC +TTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA +TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGC 
+GTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTATGTAACACATGGCTTAAATT +TGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCG +TATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAA +AGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTA +GTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTG +AGGACTATTAAGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGTAAAACATTTT +ATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGG +TACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAA +CAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTT +ATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTA +GGTGATGTTAGAGAAACAATGAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTG +TAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT +TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTT +ATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCA +GTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTTACTTACAAAGTCCTCAGAAT +ACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGTTACACAACAACCATAAAACCAGTTACTTATAAATTGGAT +GGTGTTGTTTGTACAGAAATTGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAAT +TGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTG +ATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT +GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTG +GCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAG +TTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAA +CCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGT 
+AGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTT +ATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCAT +GGTTTAGCTGCTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTATTGCTACAAT +TGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGT +GTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTG +GTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCA +TGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCT +ATACCTTGTAGTGTTTGTCTTAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTC +ATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT +ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATG +TGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTA +TGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTGTTACAAACGTAATAGAGCAA +CAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAGGTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGC +AAACTACACAATTGGAATTGTGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGA +CTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATG +GTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC +AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATC +ATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACTAGATCAGGCATTAGTGTCTG +ATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCA +ATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTAC +TTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCAC +ATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCC +CGTGACCTTGGTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT 
+ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAGAATAACTTAC +CTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATT +GTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACC +TGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTG +ACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTAT +ACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCAC +GATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC +CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCT +TCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACAC +ACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTCTGTTAGAGTGGTAACAACTT +TTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATCAGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTA +CTTAACAATGATTATTACAGATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTAC +ACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACAT +GCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC +CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTT +GACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTTCACACCTTTAGTACCTTTCT +GGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTA +GTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAA +GTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTG +GAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGT +TCTGATGTTCTTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTTGATGACGTAG +TTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCT +AATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCT 
+TAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAG +CTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAAT +GGTTCATGTGGTAGTGTTGGTTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAAC +TGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA +CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGA +TTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACAT +ACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAAAGAATTACTGCAAAATGGTA +TGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGATGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGT +GTTACTTTCCAAAGTGCAGTGAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTT +AGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTA +TTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC +ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATAC +TAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACTAATCCTTATGACAGCAAGAA +CTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAAT +GCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCAT +GTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTA +TAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGA +CTGACTCTTGGTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTAGCCACTGTAC +AGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCT +AAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGGT +TTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGG +CAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAG +CAGGCTGTTGCTAATGGTGATTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGA 
+CCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG +AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTC +AACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGT +TGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGCATCAGCATTGTGGGAAATCC +AACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAGTGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCT +CTTATTGTAACAGCTTTAAGGGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGAT +GTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTA +GGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC +TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAA +AGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCTACAAGCTGGTAATGCAACAG +AAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCT +AGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACC +GGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATC +CTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTT +AAAAACACAGTCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAG +TACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTT +GTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTAC +CAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGA +CGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATT +TTGATGAAGGTAATTGTGACACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAG +GACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT +AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATG +GTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTG 
+TTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGACTTAACAAAGCCTTACATTAA +GTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACAT +ACCACCCAAATTGTGTTAACTGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTG +TTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCA +CTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT +ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCA +CTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTATGACTTTGCTGTGTCTAAGGG +TTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATG +ACTACTATCGTTATAATCTACCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTT +GATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAA +TAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTA +ATGTCATCCCTACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAGGAGCTACTGT +AGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACC +TTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGC +AAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGT +CATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTT +TTAACATTTGTCAAGCTGTCACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTC +CGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC +ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAG +GTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGG +ACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTTAAACAGGGTGATGATTATGT +GTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCCGGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACAC +TTATGATTGAACGGTTCGTGTCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTC 
+TTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCT +TACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG +TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAA +TGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTATGTTTGCAATGCTCCAGGTTG +TGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTC +CATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCA +ATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGC +AGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAG +AATTACATCTTTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACCGAGGTACAAC +AACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGC +CACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAAT +TATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCT +AGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCAT +TAAAATATTTGCCTATAGATAAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTG +AATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA +AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACC +CTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTT +ATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTT +GGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCAGCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGC +ATGATGTTTCATCTGCAATTAACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAA +GCTGTCTTTATTTCACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTC +ATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA +ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTACA 
+AGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTCTTTAAAGATTGTAGTAAGGT +AATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTG +ACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAAT +GGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTG +TCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTAC +CTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACT +TAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAA +TAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGG +CATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACA +AAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAG +CTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCG +GCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA +CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTT +ATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGC +AATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTG +TGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAAC +AATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCA +CTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTA +TCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA +ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGT +GAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAAC +AACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATA +ATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGT 
+GTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGT +TGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCAT +CTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAG +TCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAAC +ATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAA +TCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTC +ATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAG +TTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACA +TTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT +ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATA +CTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCT +GATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAA +TGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTA +TTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGT +GGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTA +TAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG +GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACA +AATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTC +TTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAG +TTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAG +TCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTG +ACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCAT +GCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC 
+TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTG +TTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCAC +AAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCA +GCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTT +ATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTG +GTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGA +TTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA +ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTC +ACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTAC +AAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCA +ACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAA +TTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGG +GCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACA +ATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA +GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACA +ATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATG +CACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTA +ACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTAC +TGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACAC +CAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCA +GATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATT +CTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCT +TACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAA 
+GACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTT +GTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAA +CAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAG +CAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT +GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACA +GATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGC +ATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACC +AAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGA +AAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAAT +TTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCA +GACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT +ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCC +TCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTG +CCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAA +AGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGT +CAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATA +CATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTC +AATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTT +GCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAA +GGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAG +CAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTT +CGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGC +AACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTC 
+GTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT +AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGC +ATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACA +AGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATT +ACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCT +TCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTT +AATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGA +ACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT +TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTA +AAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTA +AATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCT +TAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATG +CCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTG +CTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCT +CAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGT +GGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATC +ACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACA +GGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACA +GATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTT +GGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGAT +GAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA +GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCA +ATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGAC 
+GGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACT +TTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGAT +TGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTT +GGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATC +ATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC +GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGG +ATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAAT +TGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGT +TGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTAC +GTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCC +AAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGA +CAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCT +ATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGC +AATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAG +CAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTA +GGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAG +CTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAA +GAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC +AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCC +AGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGC +CATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACAT +TCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAG +CAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTC 
+AACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATA +GTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT +TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGT +ACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCAT +GTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/data/FluA_H1N1.fa b/data/FluA_H1N1.fa new file mode 100644 index 0000000..119b0d2 --- /dev/null +++ b/data/FluA_H1N1.fa @@ -0,0 +1,182 @@ +>kraken:taxid|211044|NC_002023.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 1, complete sequence +AGCGAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAACTAAGAAATCTAATGTCGCAGTCTCGCACCCGCGA +GATACTCACAAAAACCACCGTGGACCATATGGCCATAATCAAGAAGTACACATCAGGAAGACAGGAGAAGAACCCAGCAC +TTAGGATGAAATGGATGATGGCAATGAAATATCCAATTACAGCAGACAAGAGGATAACGGAAATGATTCCTGAGAGAAAT +GAGCAAGGACAAACTTTATGGAGTAAAATGAATGATGCCGGATCAGACCGAGTGATGGTATCACCTCTGGCTGTGACATG +GTGGAATAGGAATGGACCAATGACAAATACAGTTCATTATCCAAAAATCTACAAAACTTATTTTGAAAGAGTCGAAAGGC +TAAAGCATGGAACCTTTGGCCCTGTCCATTTTAGAAACCAAGTCAAAATACGTCGGAGAGTTGACATAAATCCTGGTCAT +GCAGATCTCAGTGCCAAGGAGGCACAGGATGTAATCATGGAAGTTGTTTTCCCTAACGAAGTGGGAGCCAGGATACTAAC +ATCGGAATCGCAACTAACGATAACCAAAGAGAAGAAAGAAGAACTCCAGGATTGCAAAATTTCTCCTTTGATGGTTGCAT +ACATGTTGGAGAGAGAACTGGTCCGCAAAACGAGATTCCTCCCAGTGGCTGGTGGAACAAGCAGTGTGTACATTGAAGTG +TTGCATTTGACTCAAGGAACATGCTGGGAACAGATGTATACTCCAGGAGGGGAAGTGAAGAATGATGATGTTGATCAAAG +CTTGATTATTGCTGCTAGGAACATAGTGAGAAGAGCTGCAGTATCAGCAGACCCACTAGCATCTTTATTGGAGATGTGCC +ACAGCACACAGATTGGTGGAATTAGGATGGTAGACATCCTTAAGCAGAACCCAACAGAAGAGCAAGCCGTGGGTATATGC +AAGGCTGCAATGGGACTGAGAATTAGCTCATCCTTCAGTTTTGGTGGATTCACATTTAAGAGAACAAGCGGATCATCAGT +CAAGAGAGAGGAAGAGGTGCTTACGGGCAATCTTCAAACATTGAAGATAAGAGTGCATGAGGGATATGAAGAGTTCACAA +TGGTTGGGAGAAGAGCAACAGCCATACTCAGAAAAGCAACCAGGAGATTGATTCAGCTGATAGTGAGTGGGAGAGACGAA +CAGTCGATTGCCGAAGCAATAATTGTGGCCATGGTATTTTCACAAGAGGATTGTATGATAAAAGCAGTTAGAGGTGATCT 
+GAATTTCGTCAATAGGGCGAATCAGCGACTGAATCCTATGCATCAACTTTTAAGACATTTTCAGAAGGATGCGAAAGTGC +TTTTTCAAAATTGGGGAGTTGAACCTATCGACAATGTGATGGGAATGATTGGGATATTGCCCGACATGACTCCAAGCATC +GAGATGTCAATGAGAGGAGTGAGAATCAGCAAAATGGGTGTAGATGAGTACTCCAGCACGGAGAGGGTAGTGGTGAGCAT +TGACCGGTTCTTGAGAGTCCGGGACCAACGAGGAAATGTACTACTGTCTCCCGAGGAGGTCAGTGAAACACAGGGAACAG +AGAAACTGACAATAACTTACTCATCGTCAATGATGTGGGAGATTAATGGTCCTGAATCAGTGTTGGTCAATACCTATCAA +TGGATCATCAGAAACTGGGAAACTGTTAAAATTCAGTGGTCCCAGAACCCTACAATGCTATACAATAAAATGGAATTTGA +ACCATTTCAGTCTTTAGTACCTAAGGCCATTAGAGGCCAATACAGTGGGTTTGTGAGAACTCTGTTCCAACAAATGAGGG +ATGTGCTTGGGACATTTGATACCGCACAGATAATAAAACTTCTTCCCTTCGCAGCCGCTCCACCAAAGCAAAGTAGAATG +CAGTTCTCCTCATTTACTGTGAATGTGAGGGGATCAGGAATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTA +CAACAAGGCCACGAAGAGACTCACAGTTCTCGGAAAGGATGCTGGCACTTTAACCGAAGACCCAGATGAAGGCACAGCTG +GAGTGGAGTCCGCTGTTCTGAGGGGATTCCTCATTCTGGGCAAAGAAGACAGGAGATATGGGCCAGCATTAAGCATCAAT +GAACTGAGCAACCTTGCGAAAGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA +ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAGTGTCGAATAGTTT +AAAAACGACCTTGTTTCTACT +>kraken:taxid|211044|NC_002021.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 2, complete sequence +AGCGAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTAAAAGTGCCAGCACAAAATGCTATAAG +CACAACTTTCCCTTATACCGGAGACCCTCCTTACAGCCATGGGACAGGAACAGGATACACCATGGATACTGTCAACAGGA +CACATCAGTACTCAGAAAAGGCAAGATGGACAACAAACACCGAAACTGGAGCACCGCAACTCAACCCGATTGATGGGCCA +CTGCCAGAAGACAATGAACCAAGTGGTTATGCCCAAACAGATTGTGTATTGGAAGCAATGGCTTTCCTTGAGGAATCCCA +TCCTGGTATTTTTGAAAACTCGTGTATTGAAACGATGGAGGTTGTTCAGCAAACACGAGTAGACAAGCTGACACAAGGCC +GACAGACCTATGACTGGACTTTAAATAGAAACCAGCCTGCTGCAACAGCATTGGCCAACACAATAGAAGTGTTCAGATCA +AATGGCCTCACGGCCAATGAGTCTGGAAGGCTCATAGACTTCCTTAAGGATGTAATGGAGTCAATGAAAAAAGAAGAAAT +GGGGATCACAACTCATTTTCAGAGAAAGAGACGGGTGAGAGACAATATGACTAAGAAAATGATAACACAGAGAACAATAG +GTAAAAGGAAACAGAGATTGAACAAAAGGAGTTATCTAATTAGAGCATTGACCCTGAACACAATGACCAAAGATGCTGAG 
+AGAGGGAAGCTAAAACGGAGAGCAATTGCAACCCCAGGGATGCAAATAAGGGGGTTTGTATACTTTGTTGAGACACTGGC +AAGGAGTATATGTGAGAAACTTGAACAATCAGGGTTGCCAGTTGGAGGCAATGAGAAGAAAGCAAAGTTGGCAAATGTTG +TAAGGAAGATGATGACCAATTCTCAGGACACCGAACTTTCTTTGACCATCACTGGAGATAACACCAAATGGAACGAAAAT +CAGAATCCTCGGATGTTTTTGGCCATGATCACATATATGACCAGAAATCAGCCCGAATGGTTCAGAAATGTTCTAAGTAT +TGCTCCAATAATGTTCTCAAACAAAATGGCGAGACTGGGAAAAGGGTATATGTTTGAGAGCAAGAGTATGAAACTTAGAA +CTCAAATACCTGCAGAAATGCTAGCAAGCATTGATTTGAAATATTTCAATGATTCAACAAGAAAGAAGATTGAAAAAATC +CGACCGCTCTTAATAGAGGGGACTGCATCATTGAGCCCTGGAATGATGATGGGCATGTTCAATATGTTAAGCACTGTATT +AGGCGTCTCCATCCTGAATCTTGGACAAAAGAGATACACCAAGACTACTTACTGGTGGGATGGTCTTCAATCCTCTGACG +ATTTTGCTCTGATTGTGAATGCACCCAATCATGAAGGGATTCAAGCCGGAGTCGACAGGTTTTATCGAACCTGTAAGCTA +CATGGAATCAATATGAGCAAGAAAAAGTCTTACATAAACAGAACAGGTACATTTGAATTCACAAGTTTTTTCTATCGTTA +TGGGTTTGTTGCCAATTTCAGCATGGAGCTTCCCAGTTTTGGTGTGTCTGGGAGCAACGAGTCAGCGGACATGAGTATTG +GAGTTACTGTCATCAAAAACAATATGATAAACAATGATCTTGGTCCAGCAACAGCTCAAATGGCCCTTCAGTTGTTCATC +AAAGATTACAGGTACACGTACCGATGCCATAGAGGTGACACACAAATACAAACCCGAAGATCATTTGAAATAAAGAAACT +GTGGGAGCAAACCCGTTCCAAAGCTGGACTGCTGGTCTCCGACGGAGGCCCAAATTTATACAACATTAGAAATCTCCACA +TTCCTGAAGTCTGCCTAAAATGGGAATTGATGGATGAGGATTACCAGGGGCGTTTATGCAACCCACTGAACCCATTTGTC +AGCCATAAAGAAATTGAATCAATGAACAATGCAGTGATGATGCCAGCACATGGTCCAGCCAAAAACATGGAGTATGATGC +TGTTGCAACAACACACTCCTGGATCCCCAAAAGAAATCGATCCATCTTGAATACAAGTCAAAGAGGAGTACTTGAAGATG +AACAAATGTACCAAAGGTGCTGCAATTTATTTGAAAAATTCTTCCCCAGCAGTTCATACAGAAGACCAGTCGGGATATCC +AGTATGGTGGAGGCTATGGTTTCCAGAGCCCGAATTGATGCACGGATTGATTTCGAATCTGGAAGGATAAAGAAAGAAGA +GTTCACTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATG +AAAAAATGCCTTGTTCCTACT +>kraken:taxid|211044|NC_002022.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 3, complete sequence +AGCGAAAGCAGGTACTGATCCAAAATGGAAGATTTTGTGCGACAATGCTTCAATCCGATGATTGTCGAGCTTGCGGAAAA +AACAATGAAAGAGTATGGGGAGGACCTGAAAATCGAAACAAACAAATTTGCAGCAATATGCACTCACTTGGAAGTATGCT 
+TCATGTATTCAGATTTCCACTTCATCAATGAGCAAGGCGAGTCAATAATCGTAGAACTTGGTGATCCTAATGCACTTTTG +AAGCACAGATTTGAAATAATCGAGGGAAGAGATCGCACAATGGCCTGGACAGTAGTAAACAGTATTTGCAACACTACAGG +GGCTGAGAAACCAAAGTTTCTACCAGATTTGTATGATTACAAGGAAAATAGATTCATCGAAATTGGAGTAACAAGGAGAG +AAGTTCACATATACTATCTGGAAAAGGCCAATAAAATTAAATCTGAGAAAACACACATCCACATTTTCTCGTTCACTGGG +GAAGAAATGGCCACAAAGGCCGACTACACTCTCGATGAAGAAAGCAGGGCTAGGATCAAAACCAGGCTATTCACCATAAG +ACAAGAAATGGCCAGCAGAGGCCTCTGGGATTCCTTTCGTCAGTCCGAGAGAGGAGAAGAGACAATTGAAGAAAGGTTTG +AAATCACAGGAACAATGCGCAAGCTTGCCGACCAAAGTCTCCCGCCGAACTTCTCCAGCCTTGAAAATTTTAGAGCCTAT +GTGGATGGATTCGAACCGAACGGCTACATTGAGGGCAAGCTGTCTCAAATGTCCAAAGAAGTAAATGCTAGAATTGAACC +TTTTTTGAAAACAACACCACGACCACTTAGACTTCCGAATGGGCCTCCCTGTTCTCAGCGGTCCAAATTCCTGCTGATGG +ATGCCTTAAAATTAAGCATTGAGGACCCAAGTCATGAAGGAGAGGGAATACCGCTATATGATGCAATCAAATGCATGAGA +ACATTCTTTGGATGGAAGGAACCCAATGTTGTTAAACCACACGAAAAGGGAATAAATCCAAATTATCTTCTGTCATGGAA +GCAAGTACTGGCAGAACTGCAGGACATTGAGAATGAGGAGAAAATTCCAAAGACTAAAAATATGAAAAAAACAAGTCAGC +TAAAGTGGGCACTTGGTGAGAACATGGCACCAGAAAAGGTAGACTTTGACGACTGTAAAGATGTAGGTGATTTGAAGCAA +TATGATAGTGATGAACCAGAATTGAGGTCGCTTGCAAGTTGGATTCAGAATGAGTTCAACAAGGCATGCGAACTGACAGA +TTCAAGCTGGATAGAGCTTGATGAGATTGGAGAAGATGTGGCTCCAATTGAACACATTGCAAGCATGAGAAGGAATTATT +TCACATCAGAGGTGTCTCACTGCAGAGCCACAGAATACATAATGAAGGGGGTGTACATCAATACTGCCTTACTTAATGCA +TCTTGTGCAGCAATGGATGATTTCCAATTAATTCCAATGATAAGCAAGTGTAGAACTAAGGAGGGAAGGCGAAAGACCAA +CTTGTATGGTTTCATCATAAAAGGAAGATCCCACTTAAGGAATGACACCGACGTGGTAAACTTTGTGAGCATGGAGTTTT +CTCTCACTGACCCAAGACTTGAACCACACAAATGGGAGAAGTACTGTGTTCTTGAGATAGGAGATATGCTTCTAAGAAGT +GCCATAGGCCAGGTTTCAAGGCCCATGTTCTTGTATGTGAGGACAAATGGAACCTCAAAAATTAAAATGAAATGGGGAAT +GGAGATGAGGCGTTGTCTCCTCCAGTCACTTCAACAAATTGAGAGTATGATTGAAGCTGAGTCCTCTGTCAAAGAGAAAG +ACATGACCAAAGAGTTCTTTGAGAACAAATCAGAAACATGGCCCATTGGAGAGTCTCCCAAAGGAGTGGAGGAAAGTTCC +ATTGGGAAGGTCTGCAGGACTTTATTAGCAAAGTCGGTATTTAACAGCTTGTATGCATCTCCACAACTAGAAGGATTTTC +AGCTGAATCAAGAAAACTGCTTCTTATCGTTCAGGCTCTTAGGGACAATCTGGAACCTGGGACCTTTGATCTTGGGGGGC 
+TATATGAAGCAATTGAGGAGTGCCTAATTAATGATCCCTGGGTTTTGCTTAATGCTTCTTGGTTCAACTCCTTCCTTACA +CATGCATTGAGTTAGTTGTGGCAGTGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTTTCTACT +>kraken:taxid|211044|NC_002017.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 4, complete sequence +AGCAAAAGCAGGGGAAAATAAAAACAACCAAAATGAAGGCAAACCTACTGGTCCTGTTATGTGCACTTGCAGCTGCAGAT +GCAGACACAATATGTATAGGCTACCATGCGAACAATTCAACCGACACTGTTGACACAGTGCTCGAGAAGAATGTGACAGT +GACACACTCTGTTAACCTGCTCGAAGACAGCCACAACGGAAAACTATGTAGATTAAAAGGAATAGCCCCACTACAATTGG +GGAAATGTAACATCGCCGGATGGCTCTTGGGAAACCCAGAATGCGACCCACTGCTTCCAGTGAGATCATGGTCCTACATT +GTAGAAACACCAAACTCTGAGAATGGAATATGTTATCCAGGAGATTTCATCGACTATGAGGAGCTGAGGGAGCAATTGAG +CTCAGTGTCATCATTCGAAAGATTCGAAATATTTCCCAAAGAAAGCTCATGGCCCAACCACAACACAACCAAAGGAGTAA +CGGCAGCATGCTCCCATGCGGGGAAAAGCAGTTTTTACAGAAATTTGCTATGGCTGACGGAGAAGGAGGGCTCATACCCA +AAGCTGAAAAATTCTTATGTGAACAAGAAAGGGAAAGAAGTCCTTGTACTGTGGGGTATTCATCACCCGTCTAACAGTAA +GGATCAACAGAATATCTATCAGAATGAAAATGCTTATGTCTCTGTAGTGACTTCAAATTATAACAGGAGATTTACCCCGG +AAATAGCAGAAAGACCCAAAGTAAGAGATCAAGCTGGGAGGATGAACTATTACTGGACCTTGCTAAAACCCGGAGACACA +ATAATATTTGAGGCAAATGGAAATCTAATAGCACCAAGGTATGCTTTCGCACTGAGTAGAGGCTTTGGGTCCGGCATCAT +CACCTCAAACGCATCAATGCATGAGTGTAACACGAAGTGTCAAACACCCCTGGGAGCTATAAACAGCAGTCTCCCTTTCC +AGAATATACACCCAGTCACAATAGGAGAGTGCCCAAAATACGTCAGGAGTGCCAAATTGAGGATGGTTACAGGACTAAGG +AACATTCCGTCCATTCAATCCAGAGGTCTATTTGGAGCCATTGCCGGTTTTATTGAAGGGGGATGGACTGGAATGATAGA +TGGATGGTACGGTTATCATCATCAGAATGAACAGGGATCAGGCTATGCAGCGGATCAAAAAAGCACACAAAATGCCATTA +ACGGGATTACAAACAAGGTGAACTCTGTTATCGAGAAAATGAACATTCAATTCACAGCTGTGGGTAAAGAATTCAACAAA +TTAGAAAAAAGGATGGAAAATTTAAATAAAAAAGTTGATGATGGATTTCTGGACATTTGGACATATAATGCAGAATTGTT +AGTTCTACTGGAAAATGAAAGGACTCTGGATTTCCATGACTCAAATGTGAAGAATCTGTATGAGAAAGTAAAAAGCCAAT +TAAAGAATAATGCCAAAGAAATCGGAAATGGATGTTTTGAGTTCTACCACAAGTGTGACAATGAATGCATGGAAAGTGTA +AGAAATGGGACTTATGATTATCCCAAATATTCAGAAGAGTCAAAGTTGAACAGGGAAAAGGTAGATGGAGTGAAATTGGA +ATCAATGGGGATCTATCAGATTCTGGCGATCTACTCAACTGTCGCCAGTTCACTGGTGCTTTTGGTCTCCCTGGGGGCAA 
+TCAGTTTCTGGATGTGTTCTAATGGATCTTTGCAGTGCAGAATATGCATCTGAGATTAGAATTTCAGAAATATGAGGAAA +AACACCCTTGTTTCTACT +>kraken:taxid|211044|NC_002019.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 5, complete sequence +AGCAAAAGCAGGGTAGATAATCACTCACTGAGTGACATCAAAATCATGGCGTCCCAAGGCACCAAACGGTCTTACGAACA +GATGGAGACTGATGGAGAACGCCAGAATGCCACTGAAATCAGAGCATCCGTCGGAAAAATGATTGGTGGAATTGGACGAT +TCTACATCCAAATGTGCACAGAACTTAAACTCAGTGATTATGAGGGACGGTTGATCCAAAACAGCTTAACAATAGAGAGA +ATGGTGCTCTCTGCTTTTGACGAAAGGAGAAATAAATACCTGGAAGAACATCCCAGTGCGGGGAAGGATCCTAAGAAAAC +TGGAGGACCTATATACAGAAGAGTAAACGGAAAGTGGATGAGAGAACTCATCCTTTATGACAAAGAAGAAATAAGGCGAA +TCTGGCGCCAAGCTAATAATGGTGACGATGCAACGGCTGGTCTGACTCACATGATGATCTGGCATTCCAATTTGAATGAT +GCAACTTATCAGAGGACAAGGGCTCTTGTTCGCACCGGAATGGATCCCAGGATGTGCTCTCTGATGCAAGGTTCAACTCT +CCCTAGGAGGTCTGGAGCCGCAGGTGCTGCAGTCAAAGGAGTTGGAACAATGGTGATGGAATTGGTCAGGATGATCAAAC +GTGGGATCAATGATCGGAACTTCTGGAGGGGTGAGAATGGACGAAAAACAAGAATTGCTTATGAAAGAATGTGCAACATT +CTCAAAGGGAAATTTCAAACTGCTGCACAAAAAGCAATGATGGATCAAGTGAGAGAGAGCCGGGACCCAGGGAATGCTGA +GTTCGAAGATCTCACTTTTCTAGCACGGTCTGCACTCATATTGAGAGGGTCGGTTGCTCACAAGTCCTGCCTGCCTGCCT +GTGTGTATGGACCTGCCGTAGCCAGTGGGTACGACTTTGAAAGAGAGGGATACTCTCTAGTCGGAATAGACCCTTTCAGA +CTGCTTCAAAACAGCCAAGTGTACAGCCTAATCAGACCAAATGAGAATCCAGCACACAAGAGTCAACTGGTGTGGATGGC +ATGCCATTCTGCCGCATTTGAAGATCTAAGAGTATTGAGCTTCATCAAAGGGACGAAGGTGGTCCCAAGAGGGAAGCTTT +CCACTAGAGGAGTTCAAATTGCTTCCAATGAAAATATGGAGACTATGGAATCAAGTACACTTGAACTGAGAAGCAGGTAC +TGGGCCATAAGGACCAGAAGTGGAGGAAACACCAATCAACAGAGGGCATCTGCGGGCCAAATCAGCATACAACCTACGTT +CTCAGTACAGAGAAATCTCCCTTTTGACAGAACAACCGTTATGGCAGCATTCACTGGGAATACAGAGGGGAGAACATCTG +ACATGAGGACCGAAATCATAAGGATGATGGAAAGTGCAAGACCAGAAGATGTGTCTTTCCAGGGGCGGGGAGTCTTCGAG +CTCTCGGACGAAAAGGCAGCGAGCCCGATCGTGCCTTCCTTTGACATGAGTAATGAAGGATCTTATTTCTTCGGAGACAA +TGCAGAGGAGTACGACAATTAAAGAAAAATACCCTTGTTTCTACT +>kraken:taxid|211044|NC_002018.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 6, complete sequence 
+AGCGAAAGCAGGGGTTTAAAATGAATCCAAATCAGAAAATAATAACCATTGGATCAATCTGTCTGGTAGTCGGACTAATT +AGCCTAATATTGCAAATAGGGAATATAATCTCAATATGGATTAGCCATTCAATTCAAACTGGAAGTCAAAACCATACTGG +AATATGCAACCAAAACATCATTACCTATAAAAATAGCACCTGGGTAAAGGACACAACTTCAGTGATATTAACCGGCAATT +CATCTCTTTGTCCCATCCGTGGGTGGGCTATATACAGCAAAGACAATAGCATAAGAATTGGTTCCAAAGGAGACGTTTTT +GTCATAAGAGAGCCCTTTATTTCATGTTCTCACTTGGAATGCAGGACCTTTTTTCTGACCCAAGGTGCCTTACTGAATGA +CAGGCATTCAAATGGGACTGTTAAGGACAGAAGCCCTTATAGGGCCTTAATGAGCTGCCCTGTCGGTGAAGCTCCGTCCC +CGTACAATTCAAGATTTGAATCGGTTGCTTGGTCAGCAAGTGCATGTCATGATGGCATGGGCTGGCTAACAATCGGAATT +TCAGGTCCAGATAATGGAGCAGTGGCTGTATTAAAATACAACGGCATAATAACTGAAACCATAAAAAGTTGGAGGAAGAA +AATATTGAGGACACAAGAGTCTGAATGTGCCTGTGTAAATGGTTCATGTTTTACTATAATGACTGATGGCCCGAGTGATG +GGCTGGCCTCGTACAAAATTTTCAAGATCGAAAAGGGGAAGGTTACTAAATCAATAGAGTTGAATGCACCTAATTCTCAC +TATGAGGAATGTTCCTGTTACCCTGATACCGGCAAAGTGATGTGTGTGTGCAGAGACAATTGGCATGGTTCGAACCGGCC +ATGGGTGTCTTTCGATCAAAACCTGGATTATCAAATAGGATACATCTGCAGTGGGGTTTTCGGTGACAACCCGCGTCCCA +AAGATGGAACAGGCAGCTGTGGTCCAGTGTATGTTGATGGAGCAAACGGAGTAAAGGGATTTTCATATAGGTATGGTAAT +GGTGTTTGGATAGGAAGGACCAAAAGTCACAGTTCCAGACATGGGTTTGAGATGATTTGGGATCCTAATGGATGGACAGA +GACTGATAGTAAGTTCTCTGTGAGGCAAGATGTTGTGGCAATGACTGATTGGTCAGGGTATAGCGGGAGTTTCGTTCAAC +ATCCTGAGCTAACAGGGCTAGACTGTATAAGGCCGTGCTTCTGGGTTGAATTAATCAGGGGACGACCTAAAGAAAAAACA +ATCTGGACTAGTGCGAGCAGCATTTCTTTTTGTGGCGTGAATAGTGATACTGTAGATTGGTCTTGGCCAGACGGTGCTGA +GTTGCCATTCACCATTGACAAGTAGTCTGTTCAAAAAACTCCTTGTTTCTACT +>kraken:taxid|211044|NC_002016.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 7, complete sequence +AGCGAAAGCAGGTAGATATTGAAAGATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCATCCCGTCAGGCC +CCCTCAAAGCCGAGATCGCACAGAGACTTGAAGATGTCTTTGCAGGGAAGAACACCGATCTTGAGGTTCTCATGGAATGG +CTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTAGGATTTGTGTTCACGCTCACCGTGCCCAGTGAGCG +AGGACTGCAGCGTAGACGCTTTGTCCAAAATGCCCTTAATGGGAACGGGGATCCAAATAACATGGACAAAGCAGTTAAAC +TGTATAGGAAGCTCAAGAGGGAGATAACATTCCATGGGGCCAAAGAAATCTCACTCAGTTATTCTGCTGGTGCACTTGCC 
+AGTTGTATGGGCCTCATATACAACAGGATGGGGGCTGTGACCACTGAAGTGGCATTTGGCCTGGTATGTGCAACCTGTGA +ACAGATTGCTGACTCCCAGCATCGGTCTCATAGGCAAATGGTGACAACAACCAACCCACTAATCAGACATGAGAACAGAA +TGGTTTTAGCCAGCACTACAGCTAAGGCTATGGAGCAAATGGCTGGATCGAGTGAGCAAGCAGCAGAGGCCATGGAGGTT +GCTAGTCAGGCTAGGCAAATGGTGCAAGCGATGAGAACCATTGGGACTCATCCTAGCTCCAGTGCTGGTCTGAAAAATGA +TCTTCTTGAAAATTTGCAGGCCTATCAGAAACGAATGGGGGTGCAGATGCAACGGTTCAAGTGATCCTCTCGCTATTGCC +GCAAATATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCATTTACCGTCGCTTTAA +ATACGGACTGAAAGGAGGGCCTTCTACGGAAGGAGTGCCAAAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAGTG +CTGTGGATGCTGACGATGGTCATTTTGTCAGCATAGAGCTGGAGTAAAAAACTACCTTGTTTCTACT +>kraken:taxid|211044|NC_002020.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 8, complete sequence +AGCAAAAGCAGGGTGACAAAGACATAATGGATCCAAACACTGTGTCAAGCTTTCAGGTAGATTGCTTTCTTTGGCATGTC +CGCAAACGAGTTGCAGACCAAGAACTAGGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAAATCCCTAAGAGG +AAGGGGCAGCACTCTTGGTCTGGACATCGAGACAGCCACACGTGCTGGAAAGCAGATAGTGGAGCGGATTCTGAAAGAAG +AATCCGATGAGGCACTTAAAATGACCATGGCCTCTGTACCTGCGTCGCGTTACCTAACCGACATGACTCTTGAGGAAATG +TCAAGGGAATGGTCCATGCTCATACCCAAGCAGAAAGTGGCAGGCCCTCTTTGTATCAGAATGGACCAGGCGATCATGGA +TAAAAACATCATACTGAAAGCGAACTTCAGTGTGATTTTTGACCGGCTGGAGACTCTAATATTGCTAAGGGCTTTCACCG +AAGAGGGAGCAATTGTTGGCGAAATTTCACCATTGCCTTCTCTTCCAGGACATACTGCTGAGGATGTCAAAAATGCAGTT +GGAGTCCTCATCGGAGGACTTGAATGGAATGATAACACAGTTCGAGTCTCTGAAACTCTACAGAGATTCGCTTGGAGAAG +CAGTAATGAGAATGGGAGACCTCCACTCACTCCAAAACAGAAACGAGAAATGGCGGGAACAATTAGGTCAGAAGTTTGAA +GAAATAAGATGGTTGATTGAAGAAGTGAGACACAAACTGAAGGTAACAGAGAATAGTTTTGAGCAAATAACATTTATGCA +AGCCTTACATCTATTGCTTGAAGTGGAGCAAGAGATAAGAACTTTCTCATTTCAGCTTATTTAATAATAAAAAACACCCT +TGTTTCTACT diff --git a/data/FluA_H2N2.fa b/data/FluA_H2N2.fa new file mode 100644 index 0000000..a8856d3 --- /dev/null +++ b/data/FluA_H2N2.fa @@ -0,0 +1,180 @@ +>kraken:taxid|488241|NC_007378.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 1, complete sequence 
+AGCAAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAACTACGGAATCTGATGTCGCAGTCTCGCACTCGCGA +GATACTAACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCAGGGAGACAGGAAAAGAACCCGTCAC +TTAGGATGAAATGGATGATGGCAATGAAATATCCAATTACAGCTGACAAGAGGATAACAGAAATGGTTCCTGAGAGAAAT +GAGCAAGGACAAACTCTATGGAGTAAAATGAGTGATGCCGGGTCAGATCGAGTAATGGTATCACCTTTGGCAGTGACATG +GTGGAATAGAAATGGACCAATGACAAGTACGGTTCATTATCCAAAAATCTACAAGACTTATTTTGAGAAAGTCGAAAGGT +TAAAACATGGAACCTTTGGCCCTGTCCATTTTAGAAACCAAGTCAAAATACGCCGAAGAGTTGACATAAACCCTGGTCAT +GCAGACCTCAGTGCCAAGGAGGCACAAGACGTAATCATGGAAGTTGTTTTCCCCAATGAAGTGGGGGCCAGGATACTAAC +GTCGGAATCACAATTAACAATAACCAAAGAGAAAAAAGAAGAACTCCAAGATTGCAAAATTTCTCCTTTGATGGTTGCAT +ACATGTTAGAGAGAGAACTTGTCCGAAAAACGAGATTTCTCCCAGTTGCTGGTGGAACAAGCAGTGTGTACATTGAAGTG +TTACACTTGACTCAAGGAACATGTTGGGAACAGATGTACACCCCAGGTGGAGAAGTGAGGAATGATGATGTTGATCAAAG +TCTAATTATTGCAGCCAGGAACATAGTGAGAAGAGCAGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCC +ACAGCACACAGATTGGCGGGACAAGGATGGTGGACATTCTTAGGCAGAACCCAACGGAAGAACAAGCTGTGGATATATGC +AAGGCTGCAATGGGACTGAGAATCAGCTCATCCTTCAGTTTTGGCGGGTTCACATTTAAGAGAACAAGCGGGTCATCAAT +CAAGAGAGAGGAAGAAGTGCTTACGGGCAATCTCCAAACATTGAAAATAAGGGTGCATGAGGGGTACGAGGAATTCACAA +TGGTGGGGAAAAGGGCAACAGCTATACTCAGAAAAGCAACCAGGAGATTGGTTCAGCTGATAGTGAGTGGAAGAGACGAA +CAGTCAATAGCCGAAGCAATAATTGTAGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGAGGTGACCT +GAATTTCGTTAATAGGGCAAATCAGCGATTGAATCCCATGCATCAACTTTTAAGACATTTTCAGAAAGATGCAAAAGTGC +TCTTTCAAAATTGGGGAATTGAACATATCGACAATGTAATGGGAATGATTGGAGTATTACCAGACATGACTCCAAGCACA +GAGATGTCAATGAGAGGGATAAGAGTCAGCAAAATGGGCGTGGATGAATACTCCAGCACAGAGAGGGTAGTGGTAAGCAT +TGACCGGTTTTTGAGAGTTCGAGACCAACGAGGAAATGTACTACTATCTCCTGAGGAGGTCAGTGAAACACAGGGGACAG +AGAAACTGACAATAACTTACTCATCGTCAATGATGTGGGAGATTAATGGCCCTGAGTCAGTGTTGGTCAATACCTATCAG +TGGATCATCAGAAACTGGGAAACTGTTAAAATTCAATGGTCTCAGAATCCTACAATGCTATACAATAAAATGGAATTTGA +GCCATTTCAGTCTTTAGTTCCTAAGGCCATTAGAGGCCAATACAGTGGATTTGTTAGGACTCTATTCCAACAAATGAGGG +ATGTACTTGGGACATTTGATACCACCCAGATAATAAAGCTTCTTCCCTTTGCAGCCGCCCCACCAAAGCAAAGTAGAATG 
+CAGTTCTCTTCATTGACTGTGAATGTGAGGGGATCAGGAATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTA +CAACAAGACCACTAAGAGACTAACAATTCTCGGAAAGGATGCTGGCACTTTAACTGAAGACCCAGATGAAGGCACATCCG +GAGTGGAGTCCGCTGTTCTGAGAGGATTCCTCATTCTGGGCAAGGAAGATAGAAGATATGGACCAGCATTAAGCATCAAT +GAACTGAGTACCCTTGCAAAAGGAGAAAAGGCTAATGTACTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA +ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAATGTTGAATAGTTT +AAAAACGACCTTGTTTCTACT +>kraken:taxid|488241|NC_007375.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 2, complete sequence +AGCAAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAG +TACTACATTCCCTTATACTGGAGATCCTCCATACAGCCATGGGACAGGAACAGGATACACCATGGACACAGTCAACAGAA +CACATCAATATTCAGAAAAGGGGAAGTGGACAACAAACACGGAAACTGGAGCGCCCCAACTTAACCCAATTGATGGACCA +CTACCTGAGGACAATGAACCAAGTGGATATGCACAAACAGACTGCGTCCTGGAAGCAATGGCTTTCCTTGAGGAATCACA +CCCAGGAATCTTTGAAAATTCGTGTCTTGAAACGATGGAAGTTATTCAACAAACAAGAGTGGACAAACTGACCCAAGGTC +GTCAGACCTATGACTGGACATTGAACAGAAATCAGCCGGCTGCAACTGCGCTAGCCAACACTATAGAGGTCTTCAGATCG +AATGGACTGACAGCTAATGAGTCGGGAAGGCTAATAGATTTCCTCAAGGATGTGATAGAATCAATGGATAAAGAGGAGAT +GGAAATAACAACACACTTCCAAAGAAAAAGAAGAGTAAGAGACAACATGACCAAGAAAATGGTCACACAACGAACAATAG +GAAAGAAGAAGCAAAGATTGAACAAGAGAAGCTATCTGATAAGAGCACTGACATTGAACACAATGACTAAAGATGCAGAG +AGAGGTAAATTAAAAAGAAGAGCAATTGCAACACCCGGTATGCAGATCAGAGGGTTCGTGCACTTTGTCGAAACACTAGC +GAGAAATATTTGTGAGAAACTTGAACAGTCTGGGCTTCCGGTTGGAGGTAATGAAAAGAAGGCTAAACTAGCAAATGTTG +TTAGAAAAATGATGACTAATTCACAAGACACAGAGCTCTCTTTCACAATTACTGGAGACAACACCAAATGGAATGAGAAT +CAAAATCCTCGAGTGTTTCTGGCGATGATAACATACATCACAAGAAATCAACCTGAATGGTTTAGAAACGTCCTGAGCAT +TGCACCCATAATGTTCTCAAATAAAATGGCTAGACTAGGGAAAGGTTACATGTTCGAAAGCAAGAGCATGAAGCTCCGAA +CACAAATACCAGCAGAAATGCTAGCAAGTATTGACCTGAAATACTTTAATGAATCAACCAGAAAGAAAATTGAGAAAATA +AGGCCTCTCCTAATAGATGGCACAGTCTCATTGAGTCCTGGAATGATGATGGGCATGTTCAACATGCTAAGTACAGTCTT +AGGAGTCTCAATCCTGAATCTCGGGCAAAAGAAATACACCAAAACNACATACTGGTGGGACGGACTCCAATCCTCTGATG 
+ACTTCGCTCTCATAGTGAATGCACCAAATCATGAGGGAATACAAGCAGGGGTGAATAGATTCTACAGAACCTGCAAGCTA +GTCGGAATCAATATGAGCAAAAAGAAGTCCTACATAAATAGGACAGGGACATTTGAATTCACAAGCTTTTTCTATCGCTA +TGGATTTGTAGCCAATTTTAGCATGGAGCTGCCCAGCTTTGGAGTGTCTGGAATTAATGAATCGGCTGATATGAGCATTG +GGGTAACAGTGATAAAGAACAATATGATAAATAATGACCTTGGGCCAGCAACAGCCCAAATGGCTCTTCAACTATTCATC +AAAGACTACAGATACACGTACCGGTGCCACAGAGGGGACACACAAATTCAGACAAGGAGATCATTCGAGCTAAAGAAGCT +GTGGGAGCAAACCCGCTCAAAGGCAGGACTTTTGGTGTCGGATGGAGGATCAAACTTATACAATATCCGGAATCTCCACA +TTCCAGAAGTCTGCTTGAAATGGGAGCTAATGGATGAAGACTATCAGGGGAGGCTTTGTAATCCCCTGAATCCATTTGTC +AGTCATAAGGAAATTGAGTCTGTAAACAATGCTGTGGTAATGCCAGCTCACGGTCCAGCCAAGAGCATGGAATATGATGC +TGTTGCTACTACACACTCCTGGACCCCTAAGAGGAACCGCTCCATTCTCAACACAAGCCAAAGGGGAATTCTTGAAGATG +AACAGATGTATCAGAAGTGTTGCAATCTATTTGAGAAATTCTTCCCTAGCAGTTCGTACAGGAGACCAGTTGGAATTTCC +AGCATGGTGGAGGCCATGGTGTCTAGGGCTCGGATTGATGCACGGATTGACTTCGAGTCTGGACGGATTAAGAAAGAGGA +GTTCGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATG +AAAAAATGCCTTGTTTCTACT +>kraken:taxid|488241|NC_007376.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 3, complete sequence +AGCAAAAGCAGGTACTGATTCGAAATGGAAGATTTTGTGCGACAATGCTTCAATCCGATGATTGTCGAACTTGCGGAAAA +GGCAATGAAAGAGTATGGAGAAGATCTGAAAATCGAAACAAACAAATTTGCAGCAATATGCACTCACTTGGAAGTATGCT +TCATGTATTCAGATTTTCATTTCATCAATGAGCAAGGCGAGTCAATAATGGTAGAGCTTGATGATCCAAATGCACTTTTG +AAGCACAGATTTGAAATAATAGAGGGAAGAGATCGCACAATGGCCTGGACAGTAGTAAACAGTATTTGCAACACCACAGG +AGCTGAGAAACCGAAGTTTCTGCCAGATTTGTATGATTACAAGGAGAATAGATTCATCGAGATTGGAGTGACAAGGAGAG +AAGTCCACATATACTATCTTGAAAAGGCCAATAAAATTAAATCTGAGAATACACACATCCACATTTTCTCATTCACTGGG +GAAGAAATGGCCACAAAGGCCGACTACACTCTCGATGAGGAAAGCAGGGCTAGGATCAAAACCAGACTATTCACCATAAG +ACAAGAAATGGCCAACAGAGGCCTCTGGGATTCCTTTCGTCAGTCCGAAAGAGGCGAAGAAACAATTGAAGAAAGATTTG +AAATCACAGGGACAATGCGCAGGCTTGCCGACCAAAGTCTCCCGCCGAACTTCTCCTGCCTTGAGAATTTTAGAGCCTAT +GTGGATGGATTCGAACCGAACGGCTACATTGAGGGCAAGCTTTCTCAAATGTCCAAAGAAGTAAATGCAAAAATTGAACC 
+TTTTCTGAAAACAACACCAAGACCAATTAGACTTCCGGATGGGCCTCCTTGTTTTCAGCGGTCCAAATTCCTGCTGATGG +ATGCTTTAAAATTAAGCATTGAGGACCCAAGTCACGAAGGGGAGGGAATACCACTATATGATGCGATCAAGTGCATGAGA +ACATTCTTTGGATGGAAAGAACCCTATATTGTTAAACCACACGAAAAGGGAATAAATCCAAATTATCTGCTGTCATGGAA +GCAAGTACTGGCGGAACTGCAGGACATTGAGAATGAGGAGAAGATTCCAAGAACTAAAAACATGAAGAAAACGAGTCAGC +TAAAGTGGGCACTTGGTGAGAACATGGCACCAGAGAAGGTAGACTTTGACAACTGTAGAGACATAAGCGATTTGAAGCAA +TATGATAGTGACGAACCTGAATTAAGGTCACTTTCAAGCTGGATCCAGAATGAGTTCAACAAGGCATGCGAGCTGACCGA +TTCAATCTGGATAGAGCTCGATGAGATTGGAGAAGACGTGGCTCCAATTGAACACATTGCAAGCATGAGAAGGAATTACT +TCACAGCAGAGGTGTCCCATTGCAGAGCCACAGAATATATAATGAAGGGGGTATACATTAATACTGCCTTGCTTAATGCA +TCCTGTGCAGCAATGGACGATTTCCAACTAATTCCCATGATAAGCAAGTGTAGAACTAAAGAGGGAAGGCGAAAGACCAA +TTTATATGGTTTCATCATAAAAGGAAGATCTCACTTAAGGAATGACACCGACGTGGTAAACTTTGTGAGCATGGAGTTTT +CTCTCACTGACCCGAGACTTGAGCCACACAAATGGGAGAAGTACTGTGTCCTTGAGATAGGAGATATGCTACTAAGAAGT +GCCATAGGCCAGATGTCAAGGCCTATGTTCTTGTATGTGAGGACAAATGGAACATCAAAGATTAAAATGAAATGGGGAAT +GGAGATGAGGCCTTGCCTCCTTCAGTCACTACAACAAATCGAGAGTATGGTTGAAGCCGAGTCCTCTGTCAAAGAGAAAG +ACATGACCAAAGAGTTTTTTGAGAATAAATCAGAAACATGGCCCATTGGGGAGTCCCCCAAAGGAGTGGAAGAAGGTTCC +ATTGGGAAGGTCTGCAGGACTTTATTAGCCAAGTCGGTATTCAATAGCCTGTATGCATCCCCACAATTAGAAGGATTTTC +AGCTGAATCAAGAAAACTGCTTCTTGTCGTTCAGGCTCTTAGGGACAATCTTGAACCTGGAACCTTTGATCTTGGGGGGC +TATATGAAGCAATTGAGGAGTGCCTGATTAATGATCCCTGGGTTTTGCTTAATGCGTCTTGGTTCAACTCCTTCCTAACA +CATGCATTAAGATAGTTGTGGCAATGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTTTCTACT +>kraken:taxid|488241|NC_007374.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 4, complete sequence +AGCAAAAGCAGGGGTTATACCATAGACAACCAAAAGCATAACAATGGCCATCATTTATCTCATACTCCTGTTCACAGCAG +TGAGGGGGGACCAGATATGCATTGGATACCATGCCAATAATTCCACAGAAAAGGTCGACACAATTCTAGAGCGGAATGTC +ACTGTGACTCATGCCAAGGACATCCTTGAGAAGACCCATAACGGAAAGCTATGCAAACTAAACGGAATCCCTCCACTTGA +ACTAGGGGACTGTAGCATTGCCGGATGGCTCCTTGGAAATCCAGAATGTGATAGGCTTCTAAGTGTGCCAGAATGGTCCT +ATATAATGGAGAAAGAAAACCCGAGATACAGTTTGTGTTACCCAGGCAGCTTCAATGACTATGAAGAATTGAAACATCTC 
+CTCAGCAGCGTGAAACATTTTGAGAAAGTTAAGATTTTGCCCAAAGATAGATGGACACAGCATACAACAACTGGAGGTTC +ATGGGCCTGCGCGGTGTCAGGTAAACCATCATTCTTCAGGAACATGGTCTGGCTGACACGTAAAGGATCAAATTATCCGG +TTGCCAAAGGATCGTACAACAATACAAGCGGAGAACAAATGCTAATAATTTGGGGAGTGCACCATCCTAATGATGAGGCA +GAACAAAGAGCATTGTACCAGAATGTGGGAACCTATGTTTCCGTAGCCACATCAACATTGTACAAAAGGTCAATCCCAGA +AATAGCAGCAAGGCCTAAAGTGAATGGACTAGGACGTAGAATGGAATTCTCTTGGACCCTCTTGGATATGTGGGACACCA +TAAATTTTGAGAGCACTGGTAATCTAGTTGCACCAGAGTATGGGTTCAAAATATCGAAAAGAGGTAGTTCAGGGATCATG +AAGACAGAAGGAACACTTGAGAACTGTGAAACCAAATGCCAAACTCCTTTGGGAGCAATAAATACAACACTACCTTTTCA +CAATGTCCACCCACTGACAATAGGTGAATGCCCCAAATATGTAAAATCGGAGAAATTGGTCTTAGCAACAGGACTAAGGA +ATGTTCCCCAGATTGAATCAAGAGGATTGTTTGGGGCAATAGCTGGTTTTATAGAAGGAGGATGGCAAGGAATGGTTGAT +GGTTGGTATGGATACCATCACAGCAATGACCAGGGATCAGGGTATGCAGCAGACAAAGAATCCACTCAAAAGGCATTTAA +TGGAATCACCAACAAGGTAAATTCTGTGATTGAAAAGATGAACACCCAATTTGAAGCTGTTGGGAAAGAATTCAGTAACT +TAGAGAAAAGACTGGAGAACTTGAACAAAAAGATGGAAGACGGGTTTCTAGATGTGTGGACATACAATGCAGAGCTTCTA +GTTCTGATGGAAAATGAGAGGACACTTGACTTTCATGATTCTAATGTCAAGAATCTGTATGATAAAGTCAGAATGCAGCT +GAGAGACAACGTCAAAGAACTAGGAAATGGATGTTTTGAATTTTATCACAAATGTGACAATGAATGCATGGATAGTGTGA +AAAACGGGACATATGATTATCCCAAGTATGAAGAAGAATCTAAACTAAATAGAAATGAAATCAAAGGGGTAAAATTGAGC +AGCATGGGGGTTTATCAAATCCTTGCCATTTATGCTACAGTAGCAGGTTCTCTGTCACTGGCAATCATGATGGCTGGGAT +CTCTTTCTGGATGTGCTCCAACGGGTCTCTGCAGTGCAGAATCTGCATATGATTGTAAGTCATTTTATAATTAAAAACAC +CCTTGTTTCCTGA +>kraken:taxid|488241|NC_007381.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 5, complete sequence +ATGGCGTCCCAAGGCACCAAACGGTCTTATGAACAGATGGAAACTGATGGGGAACGCCAGAATGCAACTGAGATCAGAGC +ATCCGTCGGGAAGATGATTGATGGAATTGGACGATTCTACATCCAAATGTGCACCGAACTTAAACTCAGTGATTATGAGG +GGCGACTGATCCAGAACAGCTTAACAATAGAGAGAATGGTGCTCTCTGCTTTTGACGAGAGAAGGAATAAATATCTGGAA +GAACATCCCAGCGCGGGGAAGGATCCTAAGAAAACTGGAGGACCCATATACAAGAGAGTAGATGGAAAGTGGATGAGGGA +ACTCGTCCTTTATGACAAAGAAGAAATAAGGCGAATCTGGCGCCAAGCCAATAATGGTGATGATGCAACAGCTGGGCTGA 
+CTCACATGATGATCTGGCATTCCAATTTGAATGATACAACATACCAGAGGACAAGAGCTCTTGTTCGCACCGGAATGGAT +CCCAGGATGTGCTCTTTGATGCAGGGTTCGACTCTCCCTAGGAGGTCTGGAGCTGCAGGCGCTGCAGTCAAAGGAGTTGG +GACAATGGTGATGGAGTTGATCAGGATGATCAAACGTGGGATCAATGATCGGAACTTCTGGAGAGGTGAGAATGGACGGA +AAACAAGGAGTGCTTACGAGAGAATGTGCAACATTCTCAAAGGAAAATTTCAAACAGCTGCACAAAGAGCAATGATGGAT +CAAGTGAGAGAAAGCCGGAACCCAGGAAATGCTGAGATCGAAGATCTAATCTTTCTGGCACGGTCTGCACTCATATTGAG +AGGGTCAGTTGCTCACAAATCTTGTCTGCCCGCCTGTGTGTATGGACCTGCCATAGCCAGTGGGTACAACTTCGAAAAAG +AGGGATACTCTCTAGTGGGAATAGACCCTTTCAAACTGCTTCAAAACAGCCAAGTATACAGCCTAATCAGACCGAACGAG +AATCCAGCACACAAGAGTCAGCTGGTGTGGATGGCATGCAATTCTGCTGCATTTGAAGATCTAAGAGTATTAAGCTTCAT +CAGAGGGACCAAAGTATCCCCAAGGGGGAAACTTTCCACTAGAGGAGTACAAATTGCTTCAAATGAAAACATGGATACTA +TGGAATCAAGTACTCTTGAACTAAGAAGCAGGTACTGGGCCATAAGGACCAGAAGTGGAGGAAACACTAATCAACAGAGG +GCCTCTGCAGGTCAAATCAGTGTACAACCTGCATTTTCTGTGCAAAGAAACCTCCCATTTGACAAACCAACCATCATGGC +AGCATTCACTGGGAATACAGAGGGAAGAACATCAGACATGAGGGCAGAAATCATAAGGATGATGGAAGGTGCAAAACCAG +AAGAAATGTCCTTCCAGGGGCGGGGAGTCTTCGAGCTCTCGGACGAAAAGGCAACGAACCCGATCGTGCCCTCTTTTGAC +ATGAGTAATGAAGGATCTTATTTCTTCGGAGACAATGCAGAGGAGTACGACAATTAA +>kraken:taxid|488241|NC_007382.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 6, complete sequence +ATGAATCCAAATCAAAAGATAATAACAATTGGCTCTGTCTCTCTCACCATTGCAACAGTATGCTTCCTCATGCAGATTGC +CATCCTGGTAACTACTGTGACATTGCATTTTAAGCAACATGAGTGCGACTCCCCCGCGAGCAACCAAGTAATGCCGTGTG +AACCAATAATAATAGAAAGGAACATAACAGAGATAGTGTATTTGAATAACACCACCATAGAGAAAGAGATCTGCCCCGAA +GTAGTGGAATACAGAAATTGGTCAAAGCCGCAATGTCAAATTACAGGATTTGCACCTTTTTCTAAGGACAATTCAATCCG +GCTTTCTGCTGGTGGGGACATTTGGGTGACGAGAGAACCTTATGTGTCATGCGATCCTGGCAAGTGTTATCAATTTGCAC +TCGGGCAGGGGACCACACTAGACAACAAACATTCAAATGACACAATACATGATAGAATCCCTCATCGAACCCTATTAATG +AATGAGTTGGGTGTTCCATTTCATTTAGGAACCAGGCAAGTGTGTGTAGCATGGTCCAGCTCAAGTTGTCACGATGGAAA +AGCATGGTTGCATGTTTGTGTCACTGGGGATGATAAAAATGCAACTGCTAGCTTCATTTATGACGGGAGGCTTATGGACA +GTATTGGTTCATGGTCTCAAAATATCCTCAGGACCCAGGAGTCGGAATGCGTTTGTATCAATGGGACTTGCACAGTAGTA 
+ATGACTGATGGAAGTGCTTCAGGAAGAGCCGATACTAGAATACTATTCATTGAAGAGGGGAAAATTGTCCATATTAGCCC +ATTGTCAGGAAGTGCTCAGCATGTAGAGGAGTGTTCCTGTTATCCTCGATATCCTGACGTCAGATGTATCTGCAGAGACA +ACTGGAAAGGCTCTAATAGGCCCGTCATAGACATAAATATGGAAGATTATAGCATTGATTCCAGTTATGTGTGCTCAGGG +CTTGTTGGCGACACACCCAGAAACGACGACAGATCTAGCAATAGTAATTGCAGGAATCCTAACAATGAGAGAGGGAATCC +AGGAGTGAAAGGCTGGGCCTTTGACAATGGAGATGACGTGTGGATGGGAAGAACGATCAGCAAGGATTTACGCTCAGGTT +ATGAAACTTTCAAAGTCATTGGTGGTTGGTCCACACCTAATTCCAAATCGCAGATCAATAGACAGGTCATAGTTGACAGC +AATAATTGGTCAGGTTACTCTGGTATTTTCTCTGTTGAGGGCAAAAGATGCATCAATAGGTGCTTTTATGTGGAGTTGAT +AAGGGGAAGGCAACAGGAGACTAGAGTATGGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACTTCAGGTACTTATG +GAACAGGCTCATGGCCTGATGGGGCGAACATCAATTTCATGCCTATATAA +>kraken:taxid|488241|NC_007377.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 7, complete sequence +AGCAAAAGCAGGTAGATATTGAAAGATGAGCCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCGTCCCGTCAGGCC +CCCTCAAAGCCGAGATCGCACAGAGACTTGAAGATGTCTTTGCTGGGAAGAACACAGATCTTGAGGCTCTCATGGAATGG +CTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTGGGATTTGTATTCACGCTCACCGTGCCAAGTGAGCG +AGGACTGCAGCGTAGACGCTTTGTCCAAAATGCCCTCAATGGGAATGGGGATCCAAATAACATGGACAGAGCAGTTAAAC +TGTATAGAAAGCTTAAGAGGGAGATAACATTCCATGGGGCCAAAGAAGTAGCGCTCAGTTATTCTGCTGGTGCACTTGCC +AGTTGCATGGGCCTCATATACAACAGGATGGGGGCTGTGACCACTGAAGTGGCCTTTGCCGTGGTATGTGCAACCTGTGA +ACAGATTGCTGACTCCCAGCATAGGTCTCACAGGCAAATGGTGACAACAACCAATCCACTAATAAGACATGAGAACAGAA +TGGTTCTGGCCAGCACTACAGCTAAGGCTATGGAGCAAATGGCTGGATCGAGTGAGCAAGCAGCAGAGGCCATGGAGGTT +GCTAGTCAGGCCAGGCAAATGGTGCAGGCAATGAGAGCCATTGGGACTCCTCCTAGCTCCAGTGCTGGTCTAAAAGATGA +TCTTCTTGAAAATTTGCAGGCCTATCAGAAACGAATGGGGGTGCAGATGCAACGATTCAAGTGACCCCCTTGTTGTTGCT +GCGAGTATCATTGGGATCTTGCACTTTATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCATTTATCGCTTCTTTAA +ACACGGTCTGAAAAGAGGGCCTTCTACGGAAGGAGTACCTGAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAGTG +CTGTGGATGCTGACGATAGTCATTTTGTCAGCATAGAGCTGGAGTAAAAAACTACCTTGTTTCTACT +>kraken:taxid|488241|NC_007380.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 8, complete sequence 
+ATGGATTCTAACACTGTGTCAAGTTTTCAGGTAGATTGCTTCCTTTGGCATGTCCGAAAACAAGTTGTAGACCAAGAACT +AGGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAAGTCCCTAAGGGGAAGAGGCAGCACTCTCGATCTAGACA +TCGAAGCAGCCACCCGTGTTGGAAAGCAGATAGTAGAGAGGATTCTGAAGGAAGAATCCGATGAGGCACTTAAAATGACC +ATGGCCTCCGCACCTGCTTCGCGATACCTAACTGACATGACTATTGAGGAATTGTCAAGGGACTGGTTCATGCTAATGCC +CAAGCAGAAAGTGGAAGGCCCTCTTTGCATCAGAATAGACCAGGCAATCATGGATAAGAACATCATGTTGAAAGCGAATT +TCAGTGTGATTTTTGACCGGCTAGAGACCCTAATATTACTAAGGGCTTTCACCGAAGAGGGAGCAATTGTTGGCGAAATT +TCACCATTGCCTTCTCTTCCAGGACATACTATTGAGGATGTCAAAAATGCAATTGGGGTCCTCATCGGAGGACTTGAATG +GAATGATAACACAGTTCGAGTCTCTAAAACTCTACAGAGATTCGCTTGGAGAAGCAGTAATGAGAATGGGAGACCTCCAC +TCACTCCAAAACAGAAACGGAAAATGGCGAGAACAATTAGGTCAAAAGTTCGAAGAGATAAGATGGCTGATTGAAGAAGT +GAGACACAGATTGAAGATAACAGAGAATAGTTTTGAGCAAATAACATTTATGCAAGCCTTACAGCTACTATTTGAAGTGG +AACAAGAGATAAGAACTTTCTCGTTTCAGCTTATTTAA diff --git a/data/FluA_H3N2.fa b/data/FluA_H3N2.fa new file mode 100644 index 0000000..092391b --- /dev/null +++ b/data/FluA_H3N2.fa @@ -0,0 +1,183 @@ +>kraken:taxid|335341|NC_007373.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 1, complete sequence +AGCAAAAGCAGGTCAATTATATTCAGTATGGAAAGAATAAAAGAACTACGGAACCTGATGTCGCAGTCTCGCACTCGCGA +GATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCGGGGAGACAGGAAAAGAACCCGTCAC +TTAGGATGAAATGGATGATGGCAATGAAATACCCAATCACTGCTGACAAAAGGATAACAGAAATGGTTCCGGAGAGAAAT +GAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGATCAGATCGAGTGATGGTATCACCTTTGGCTGTAACATG +GTGGAATAGAAATGGACCCGTGGCAAGTACGGTCCATTACCCAAAAGTATACAAGACTTATTTTGACAAAGTCGAAAGGT +TAAAACATGGAACCTTTGGCCCTGTTCATTTTAGAAATCAAGTCAAGATACGCAGAAGAGTAGACATAAACCCTGGTCAT +GCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGGATACTAAC +ATCAGAATCGCAATTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTTGCAT +ACATGTTAGAGAGAGAACTTGTCCGAAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATATACATTGAAGTC +TTACATTTGACTCAAGGAACGTGTTGGGAACAAATGTACACTCCAGGTGGAGAAGTGAGGAATGACGATGTTGACCAAAG 
+CCTAATTATTGCGGCCAGGAACATAGTAAGAAGAGCTGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCC +ACAGCACACAAATTGGCGGGACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCTGTGGATATATGC +AAGGCTGCAATGGGATTGAGAATCAGCTCATCCTTCAGCTTTGGTGGGTTTACATTTAAAAGAACAAGCGGGTCATCAGT +CAAAAAAGAGGAAGAAGTGCTTACAGGCAATCTCCAAACATTGAAGATAAGAGTACATGAGGGGTATGAGGAGTTCACAA +TGGTGGGGAAAAGAGCAACAGCTATACTCAGAAAAGCAACCAGAAGATTGGTTCAGCTCATAGTGAGTGGAAGAGACGAA +CAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAGGATTGCATGATAAAAGCAGTTAGAGGTGACCT +GAATTTCGTCAACAGAGCAAATCAACGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCGAAAGTGC +TTTTTCAAAATTGGGGAATTGAACACATCGACAGTGTGATGGGAATGGTTGGAGTATTACCAGATATGACTCCAAGCACA +GAGATGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCAT +TGATCGGTTTTTGAGAGTTCGAGACCAACGCGGGAATGTATTATTGTCTCCTGAGGAGGTCAGTGAAACACAGGGAACTG +AAAGATTGACAATAACATATTCATCGTCGATGATGTGGGAGATTAACGGTCCTGAGTCGGTTTTGGTCAATACCTATCAA +TGGATCATCAGAAATTGGGAAGCTGTCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGA +ACCATTTCAATCTTTAGTCCCCAAGGCCATTAGAAGCCAATACAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAG +ACGTACTTGGGACATTTGACACCACCCAGATAATAAAGCTTCTCCCTTTTGCAGCCGCTCCACCAAAGCAAAGCAGAATG +CAGTTCTCTTCACTGACTGTAAATGTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTA +CAACAAGACCACTAAAAGACTAACAATTCTCGGAAAAGATGCCGGCACTTTAATTGAAGACCCAGATGAAAGCACATCCG +GAGTGGAGTCCGCCGTCTTGAGAGGGTTTCTCATTATAGGTAAGGAAGACAGAAGATACGGACCAGCATTAAGCATCAAT +GAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATCGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA +ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAATGTTGAATAGTTT +AAAAACGACCTTGTTTCTACT +>kraken:taxid|335341|NC_007372.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 2, complete sequence +AGCAAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACTCTACTGTTCCTAAAGGTTCCAGCGCAAAATGCCATAAG +CACCACATTCCCTTATACTGGAGATCCTCCATACAGCCATGGAACAGGAACAGGATACACCATGGACACAGTCAACAGAA +CACACCAATATTCAGAGAAGGGGAAGTGGACGACAAATACAGAAACTGGGGCACCCCAACTCAACCCAATTGATGGACCA 
+CTACCTGAGGATAATGAGCCAAGTGGATATGCACAAACAGACTGTGTCCTGGAGGCTATGGCCTTCCTTGAAGAATCCCA +CCCAGGTATCTTTGAGAACTCATGCCTTGAAACAATGGAAGTCGTTCAACAAACAAGGGTGGACAAACTAACCCAAGGCC +GCCAGACTTATGATTGGACATTAAACAGAAATCAACCGGCAGCAACTGCATTAGCCAACACCATAGAAGTTTTTAGATCG +AATGGACTAACAGCCAATGAATCAGGAAGGCTAATAGATTTCCTCAAGGATGTGATGGAATCAATGGATAAAGAGGAAAT +GGAGATAACAACACACTTTCAAAGAAAAAGGAGAGTAAGAGACAACATGACCAAGAAAATGGTCACACAAAGAACAATAG +GGAAGAAAAAACAAAGAGTGAATAAGAGAGGCTATCTAATAAGAGCTTTGACATTGAACACGATGACCAAAGATGCAGAG +AGAGGTAAATTAAAAAGAAGGGCTATTGCAACACCCGGGATGCAAATTAGAGGGTTCGTGTACTTCGTTGAAACTTTAGC +TAGAAGCATTTGCGAAAAGCTTGAACAGTCTGGACTTCCGGTTGGGGGTAATGAAAAGAAGGCCAAACTGGCAAATGTTG +TGAGAAAAATGATGACTAATTCACAAGACACTGAGCTTTCTTTCACAATCACTGGGGACAACACTAAGTGGAATGAAAAT +CAAAACCCTCGAATGTTTTTGGCGATGATTACATATATCACAAAAAATCAACCTGAGTGGTTCAGAAACATCCTGAGCAT +CGCACCAATAATGTTCTCAAACAAAATGGCAAGACTAGGAAAAGGATACATGTTCGAGAGTAAGAGAATGAAGCTCCGAA +CACAAATACCCGCAGAAATGCTAGCAAGCATTGACCTGAAGTATTTCAATGAATCAACAAGGAAGAAAATTGAGAAAATA +AGGCCTCTTCTAATAGATGGCACAGCATCATTGAGCCCTGGGATGATGATGGGCATGTTCAACATGCTAAGTACGGTTTT +AGGAGTCTCGGTACTGAATCTTGGGCAAAAGAAATACACCAAGACAACATACTGGTGGGATGGGCTCCAATCCTCCGACG +ATTTTGCCCTCATAGTGAATGCACCAAATCATGAGGGAATACAAGCAGGAGTGGATAGATTCTACAGGACCTGCAAGTTA +GTGGGAATCAACATGAGCAAAAAGAAGTCCTATATAAATAAAACAGGGACATTTGAATTCACAAGCTTTTTTTATCGATA +TGGATTTGTGGCTAATTTTAGCATGGAGCTTCCCAGTTTTGGAGTGTCTGGAATAAACGAGTCAGCTGATATGAGTATTG +GAGTAACAGTGATAAAGAACAACATGATAAACAATGACCTTGGGCCAGCAACAGCCCAGATGGCTCTCCAATTGTTCATC +AAAGACTACAGATATACATATAGGTGCCATAGAGGAGACACACAAATTCAGACGAGAAGATCATTCGAGCTAAAGAAGCT +GTGGGATCAAACCCAATCAAGGGCAGGACTATTGGTATCAGATGGGGGACCAAACTTATACAATATCCGGAACCTTCACA +TCCCTGAAGTCTGCTTAAAGTGGGAGCTAATGGATGAGAATTATCGGGGAAGACTTTGTAACCCCCTGAATCCCTTTGTC +AGCCATAAAGAAATTGAGTCTGTAAACAATGCTGTAGTGATGCCAGCCCACGGTCCAGCCAAAAGTATGGAATATGATGC +CGTTGCAACTACACACTCCTGGAATCCCAAGAGGAACCGCTCTATTCTAAACACTAGCCAAAGGGGAATTCTTGAGGATG +AACAGATGTACCAAAAGTGCTGCAACTTGTTCGAGAAATTTTTCCCTAGTAGTTCATATAGGAGACCGATTGGAATTTCT 
+AGCATGGTGGAGGCCATGGTGTCTAGGGCCCGGATTGATGCCAGAATTGACTTCGAGTCTGGACGGATTAAGAAGGAAGA +GTTCTCTGAGATCATGAAGATCTGTTCCACCATTGAAGAACTCAGACGGCAAAAATAATGAATTTAGCTTGTCCTTCATG +AAAAAATGCCTTGTTTCTACT +>kraken:taxid|335341|NC_007371.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 3, complete sequence +AGCAAAAGCAGGTACTGATTCGAAATGGAAGATTTTGTGCGACAATGCTTCAACCCGATGATTGTCGAACTTGCAGAAAA +AGCAATGAAAGAGTATGGAGAGGATCTGAAAATTGAAACAAACAAATTTGCAGCAATATGCACCCACTTGGAGGTATGTT +TCATGTATTCAGATTTTCATTTCATCAATGAACAAGGCGAATCAATAGTGGTAGAACTTGATGATCCAAATGCACTGTTA +AAGCACAGATTTGAAATAATCGAGGGGAGAGACAGAACAATGGCCTGGACAGTAGTAAACAGTATCTGCAACACTACTGG +AGCAGAAAAACCAAAGTTTCTACCAGATTTGTATGATTACAAGGAGAATAGATTCATCGAAATTGGAGTGACAAGAAGAG +AAGTCCACATATATTACCTTGAAAAGGCCAATAAAATTAAATCTGAGAACACACACATTCACATCTTCTCATTCACTGGG +GAGGAAATAGCCACAAAGGCAGACTACACTCTCGACGAGGAAAGCAGGGCTAGGATTAAAACCAGGCTATTTACCATAAG +ACAAGAAATGGCCAACAGAGGCCTCTGGGATTCCTTTCGTCAGTCCGAAAGAGGCGAAGAAACAATTGAAGAAAAATTTG +AAATCTCAGGAACTATGCGTAGGCTTGCCGACCAAAGTCTCCCACCGAAATTCTCCTGCCTTGAGAATTTTAGAGCCTAT +GTGGATGGATTCGAACCGAACGGCTGCATTGAGGGCAAGCTTTCTCAAATGTCCAAAGAAGTGAATGCCAAAATTGAACC +TTTTCTGAAGACAACACCAAGACCAATCAAACTTCCTAATGGACCTCCTTGTTATCAGCGGTCCAAATTCCTCCTGATGG +ATGCTTTGAAATTGAGCATTGAAGACCCAAGTCATGAAGGAGAAGGGATTCCATTATATGATGCGATCAAGTGCATAAAA +ACATTCTTTGGATGGAAAGAACCTTATATAGTCAAACCACACGAAAAGGGAATAAATTCAAATTACCTGCTGTCATGGAA +GCAAGTATTGTCAGAATTGCAGGACATTGAAAATGAGGAGAAGATCCCAAGGACTAAAAACATGAAGAAAACGAGTCAAC +TAAAGTGGGCTCTTGGTGAAAACATGGCACCAGAGAAAGTAGACTTTGACAACTGCAGAGACATAAGCGATTTGAAGCAA +TATGATAGTGACGAACCTGAATTAAGGTCACTTTCAAGCTGGATACAGAATGAGTTCAACAAGGCCTGCGAGCTAACTGA +TTCAATCTGGATAGAGCTCGATGAAATTGGAGAGGACGTAGCCCCAATTGAGTACATTGCAAGCATGAGGAGGAATTATT +TCACAGCAGAGGTGTCCCATTGTAGAGCCACTGAGTACATAATGAAGGGGGTATACATTAATACTGCCCTGCTCAATGCA +TCCTGTGCAGCAATGGACGATTTTCAACTAATTCCCATGATAAGCAAGTGCAGAACTAAAGAGGGAAGGCGAAAAACCAA +TTTATATGGATTCATCATAAAGGGAAGATCTCATTTAAGGAATGACACAGATGTGGTAAACTTTGTGAGCATGGAGTTTT 
+CTCTCACTGACCCGAGACTTGAGCCACATAAATGGGAGAAATACTGTGTCCTTGAGATAGGAGATATGTTACTAAGAAGT +GCCATAGGCCAAATTTCAAGGCCTATGTTCTTGTATGTGAGGACAAACGGAACATCAAAGGTCAAAATGAAATGGGGAAT +GGAGATGAGACGTTGCCTCCTTCAGTCACTCCAGCAGATCGAGAGCATGATTGAAGCCGAGTCCTCGATTAAAGAGAAAG +ACATGACCAAAGAGTTTTTTGAGAATAAATCAGAAGCATGGCCCATTGGGGAGTCCCCCAAGGGAGTGGAAGAAGGTTCC +ATTGGGAAAGTCTGTAGGACTCTATTGGCTAAGTCAGTGTTCAATAGCCTGTATGCATCACCACAATTGGAAGGATTTTC +AGCGGAGTCAAGAAAACTGCTTCTTGTTGTTCAGGCTCTTAGGGACAACCTCGAACCTGGGACCTTTGATCTCGGGGGGC +TATATGAAGCAATTGAGGAGTGCCTGATTAATGATCCCTGGGTTTTGCTCAATGCATCTTGGTTCAACTCCTTCCTGACA +CATGCATTAAAATAGTTATGGCAGTGCTACTATTTGTTATCCGTACTGTCCAAAAAAGTACCTTGTTTCTACT +>kraken:taxid|335341|NC_007366.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 4, complete sequence +AGCAAAAGCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAA +AAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAAC +AATCACGAATGACCAAATTGAAGTCACTAATGCTACTGAACTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTC +CTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAAT +AAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCT +TAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAA +CAAGCTCTGCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATAC +CCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAACTGTACATTTGGGGGGTTCACCACCCGGGTACGGA +CAATGACCAAATCAGCCTATATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAATCC +CGAGTATCGGATCTAGACCCAGGATAAGGGATGTCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGAC +ATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAAT +GAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTC +AAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGA +AATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGA +CGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAACAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCA 
+ACCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACAAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAA +GTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCT +TGTGGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAAC +TGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATC +AGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGTTGAA +GTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCA +TCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGAGTGCATTAATTAAAAACACCCTTGTTTCTA +CT +>kraken:taxid|335341|NC_007369.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 5, complete sequence +AGCAAAAGCAGGGTTAATAATCACTCACCGAGTGACATCAAAATCATGGCGTCCCAAGGCACCAAACGGTCTTATGAACA +GATGGAAACTGATGGGGATCGCCAGAATGCAACTGAGATTAGGGCATCCGTCGGGAAGATGATTGATGGAATTGGGAGAT +TCTACATCCAAATGTGCACTGAACTTAAACTCAGTGATCATGAAGGGCGGTTGATCCAGAACAGCTTGACAATAGAGAAA +ATGGTGCTCTCTGCTTTTGATGAAAGAAGGAATAAATACCTGGAAGAACACCCCAGCGCGGGGAAAGATCCCAAGAAAAC +TGGGGGGCCCATATACAGGAGAGTAGATGGAAAATGGATGAGGGAACTCGTCCTTTATGACAAAGAAGAGATAAGGCGAA +TCTGGCGCCAAGCCAACAATGGTGAGGATGCGACAGCTGGTCTAACTCACATAATGATCTGGCATTCCAATTTGAATGAT +GCAACATACCAGAGGACAAGAGCTCTTGTTCGAACTGGAATGGATCCCAGAATGTGCTCTCTGATGCAGGGCTCGACTCT +CCCTAGAAGGTCCGGAGCTGCAGGTGCTGCAGTCAAAGGAATCGGGACAATGGTGATGGAACTGATCAGAATGGTCAAAC +GGGGGATCAACGATCGAAATTTCTGGAGAGGTGAGAATGGGCGGAAAACAAGAAGTGCTTATGAGAGAATGTGCAACATT +CTTAAAGGAAAATTTCAAACAGCTGCACAAAGAGCAATGGTGGATCAAGTGAGAGAAAGTCGGAACCCAGGAAATGCTGA +GATCGAAGATCTCATATTTTTGGCAAGATCTGCATTGATATTGAGAGGGTCAGTTGCTCACAAATCTTGCCTACCTGCCT +GTGCGTATGGACCTGCAGTATCCAGTGGGTACGACTTCGAAAAAGAGGGATATTCCTTGGTGGGAATAGACCCTTTCAAA +CTACTTCAAAATAGCCAAATATACAGCCTAATCAGACCTAACGAGAATCCAGCACACAAGAGTCAGCTGGTGTGGATGGC +ATGCCATTCTGCTGCATTTGAAGATTTAAGATTGTTAAGCTTCATCAGAGGGACAAAAGTATCTCCGCGGGGGAAACTGT +CAACTAGAGGAGTACAAATTGCTTCAAATGAGAACATGGATAATATGGGATCGAGCACTCTTGAACTGAGAAGCGGGTAC +TGGGCCATAAGGACCAGGAGTGGAGGAAACACTAATCAACAGAGGGCCTCCGCAGGCCAAACCAGTGTGCAACCTACGTT 
+TTCTGTACAAAGAAACCTCCCATTTGAAAAGTCAACCATCATGGCAGCATTCACTGGAAATACGGAGGGAAGGACTTCAG +ACATGAGGGCAGAAATCATAAGAATGATGGAAGGTGCAAAACCAGAAGAAGTGTCATTCCGGGGGAGGGGAGTTTTCGAG +CTCTCAGACGAGAAGGCAACGAACCCGATCGTGCCCTCTTTTGATATGAGTAATGAAGGATCTTATTTCTTCGGAGACAA +TGCAGAAGAGTACGACAATTAAGGAAAAAATACCCTTGTTTCTACT +>kraken:taxid|335341|NC_007368.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 6, complete sequence +AGCAAAAGCAGGAGTAAAGATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATAT +GCTTCTTCATGCAAATTGCCATCCTGATAACCACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAAC +AACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGA +GAAGGAAATGTGCCCCAAACTAGCAGAATACAGAAATTGGTCAAAGCCGCAATGTGACATTACAGGATTTGCACCTTTTT +CTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGACCCTGAC +AAGTGTTACCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATGACACAGTACATGATAGGACCCC +TTATCGGACCCTATTGATGAATGAATTAGGTGTTCCATTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCT +CAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTGTAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTAC +AATGGGAGGCTTGTAGATAGTATTGTTTCATGGTCCAAAAAAATCCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAA +TGGAACTTGTACAGTAGTAATGACTGATGGGAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGA +AAATCATTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAGGAGTGCTCCTGCTATCCTCGATATCCTGGTGTC +AGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATAGGCCCATCGTAGATATAAACATAAAGGATTATAGCATTGTTTC +CAGTTATGTGTGCTCAGGGCTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGCTTGGATCCTA +ACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACGATCAGC +GAGAAGTTACGCTCAGGATATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAAACCTAATTCCAAATTGCAGATAAATAG +GCAAGTCATAGTTGACAGAGGTAATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGT +GCTTTTATGTGGAGTTGATAAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGT +GGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACATCAATCTCATGCCTATATAAGCTTTCGCAAT +TTTAGAAAAAAACTCCTTGTTTCTACT +>kraken:taxid|335341|NC_007367.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 7, 
complete sequence +AGCAAAAGCAGGTAGATATTGAAAGATGAGCCTTCTAACCGAGGTCGAAACGTATGTTCTCTCTATCGTTCCATCAGGCC +CCCTCAAAGCCGAGATCGCGCAGAGACTTGAAGATGTCTTTGCTGGGAAAAACACAGATCTTGAGGCTCTCATGGAATGG +CTAAAGACAAGACCAATTCTGTCACCTCTGACTAAGGGGATTTTGGGGTTTGTGTTCACGCTCACCGTGCCCAGTGAGCG +AGGACTGCAGCGTAGACGCTTTGTCCAAAATGCCCTCAATGGGAATGGAGATCCAAATAACATGGACAAAGCAGTTAAAC +TGTATAGGAAACTTAAGAGGGAGATAACGTTCCATGGGGCCAAAGAAATAGCTCTCAGTTATTCTGCTGGTGCACTTGCC +AGTTGCATGGGCCTCATATACAATAGGATGGGGGCTGTAACCACTGAAGTGGCATTTGGCCTGGTATGTGCAACATGTGA +ACAGATTGCTGACTCCCAGCACAGGTCTCATAGGCAAATGGTGGCAACAACCAATCCATTAATAAAACATGAGAACAGAA +TGGTTTTGGCCAGCACTACAGCTAAGGCTATGGAGCAAATGGCTGGATCAAGTGAGCAGGCAGCGGAGGCCATGGAAATT +GCTAGTCAGGCCAGGCAAATGGTGCAGGCAATGAGAGCCGTTGGGACTCATCCTAGCTCCAGTACTGGTCTAAGAGATGA +TCTTCTTGAAAATTTGCAGACCTATCAGAAACGAATGGGGGTGCAGATGCAACGATTCAAGTGACCCGCTTGTTGTTGCC +GCGAGTATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCGTCTATCGACTCTTCAA +ACACGGCCTTAAAAGAGGCCCTTCTACGGAAGGAGTACCTGAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAATG +CTGTGGATGCTGACGACAGTCATTTTGTCAGCATAGAGTTGGAGTAAAAAACTACCTTGTTTCTACT +>kraken:taxid|335341|NC_007370.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 8, complete sequence +AGCAAAAGCAGGGTGACAAAGACATAATGGATTCCAACACTGTGTCAAGTTTCCAGGTAGATTGCTTTCTTTGGCATATC +CGGAAACAAGTTGTAGACCAAGAACTGAGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAGGTCCCTAAGGGG +AAGAGGCAATACTCTCGGTCTAGACATCAAAGCAGCCACCCATGTTGGAAAGCAAATTGTAGAAAAGATTCTGAAAGAAG +AATCTGATGAGGCACTTAAAATGACCATGGTCTCCACACCTGCTTCGCGATACATAACTGACATGACTATTGAGGAATTG +TCAAGAAACTGGTTCATGCTAATGCCCAAGCAGAAAGTGGAAGGACCTCTTTGCATCAGAATGGACCAGGCAATCATGGA +GAAAAACATCATGTTGAAAGCGAATTTCAGTGTGATTTTTGACCGACTAGAGACCATAGTATTACTAAGGGCTTTCACCG +AAGAGGGAGCAATTGTTGGCGAAATCTCACCATTGCCTTCTTTTCCAGGACATACTATTGAGGATGTCAAAAATGCAATT +GGGGTCCTCATCGGAGGACTTGAATGGAATGATAACACAGTTCGAGTCTCTAAAAATCTACAGAGATTCGCTTGGAGAAG +CAGTAATGAGAATGGGGGACCTCCACTTACTCCAAAACAGAAACGGAAAATGGCGAGAACAGCTAGGTCAAAAGTTTGAA +GAGATAAGATGGCTGATTGAAGAAGTGAGACACAGACTAAAAACAACTGAAAATAGCTTTGAACAAATAACATTCATGCA 
+AGCATTACAACTGCTGTTTGAAGTGGAACAGGAGATAAGAACTTTCTCATTTCAGCTTATTTAATGATAAAAAACACCCT +TGTTTCTACT diff --git a/data/FluB.fa b/data/FluB.fa new file mode 100644 index 0000000..24250dc --- /dev/null +++ b/data/FluB.fa @@ -0,0 +1,192 @@ +>kraken:taxid|518987|NC_002204.1 Influenza B virus RNA 1, complete sequence +AGCAGAAGCGGAGCTTTAAGATGAATATAAATCCATATTTTCTTTTCATAGATGTACCTATACAGGCAGCAATTTCAACA +ACATTCCCATACACCGGTGTTCCCCCTTATTCTCATGGAACGGGAACAGGCTACACAATAGACACCGTGATTAGAACACA +CGAGTACTCAAACAAGGGAAAACAATACATTTCTGATGTTACAGGATGTGTAATGGTAGATCCAACAAATGGGCCATTAC +CCGAAGACAATGAACCGAGTGCCTATGCACAATTGGATTGTGTTCTGGAGGCTTTGGATAGAATGGATGAAGAACATCCA +GGTCTGTTTCAAGCAGGGTCACAGAATGCCATGGAGGCACTAATGGTCACAACAGTGGACAAATTGACTCAGGGGAGACA +GACCTTTGATTGGACGGTGTGTAGAAACCAACCTGCTGCAACGGCACTGAACACAACAATAACCTCTTTTAGGTTGAATG +ATTTAAATGGAGCCGACAAGGGTGGATTAGTGCCCTTTTGCCAAGATATCATTGATTCATTAGACAAACCTGAAATGATT +TTCTTCACAGTAAAGAATATAAAGAAAAAATTGCCTGCTAAAAACAGAAAGGGTTTCCTTATAAAAAGAATACCTATGAA +GGTAAAAGACAGAATAACAAGAGTGGAATACATCAAAAGAGCATTATCATTAAACACAATGACTAAAGATGCTGAAAGAG +GCAAACTAAAAAGAAGAGCAATTGCCACCGCTGGGATACAAATCAGAGGATTTGTATTAGTAGTTGAAAACTTGGCTAAA +AATATCTGTGAAAATCTAGAGCAAAGTGGTTTACCCGTAGGTGGAAACGAAAAGAAGGCCAAACTATCAAATGCAGTGGC +TAAAATGCTCAGTAATTGTCCACCAGGAGGGATCAGTATGACTGTGACAGGAGACAATACTAAATGGAATGAATGCTTAA +ATCCAAGAATCTTTTTGGCTATGACTGAAAGAATAACCAGAGACAGCCCAATTTGGTTCCGGGATTTTTGTAGTATAGCA +CCGGTCTTGTTCTCCAATAAAATAGCTAGATTGGGAAAAGGGTTCATGATAACAAGTAAAACAAAAAGACTAAAAGCTCA +AATACCTTGTCCCGATCTGTTTAATATACCATTAGAAAGATATAATGAAGAAACAAGGGCAAAACTGAAAAAGCTAAAAC +CTTTCTTCAATGAAGAAGGAACGGCATCTCTTTCGCCAGGAATGATGATGGGAATGTTTAATATGCTATCTACAGTATTA +GGAGTAGCCGCACTAGGGATAAAAAACATTGGAAACAAAGAATACTTATGGGATGGACTGCAGTCTTCGGATGATTTTGC +TCTGTTTGTTAATGCAAAAGATGAAGAGACATGTATGGAAGGAATAAACGATTTTTACCGAACATGTAAGCTATTGGGAA +TAAACATGAGCAAAAAGAAAAGTTACTGTAATGAAACTGGGATGTTTGAATTTACCAGCATGTTTTACAGAGATGGATTT +GTATCTAATTTTGCAATGGAACTCCCTTCATTTGGAGTCGCTGGAGTGAATGAATCAGCAGACATGGCAATAGGAATGAC 
+AATAATAAAGAACAATATGATCAACAATGGGATGGGCCCAGCAACGGCACAAACAGCCATACAATTATTCATAGCTGACT +ATAGATACACCTACAAATGCCACAGGGGAGATTCCAAAGTGGAAGGGAAGAGAATGAAAATTATAAAGGAGCTATGGGAA +AACACTAAAGGAAGAGATGGTCTATTAGTAGCAGATGGTGGGCCTAATCTTTACAATTTGAGAAACCTGCATATTCCAGA +AATAATATTAAAATACAACATAATGGACCCTGAGTACAAAGGACGGTTACTGCATCCTCAAAATCCCTTTGTAGGACATT +TGTCTATTGAGGGTATCAAAGAAGCAGATATAACACCTGCACATGGCCCAATAAAGAAAATGGACTACGATGCGGTATCT +GGAACTCATAGTTGGAGAACCAAAAGGAACAGATCTATACTAAACACTGATCAGAGGAACATGATTCTTGAGGAACAATG +CTACGCTAAGTGTTGCAACCTTTTTGAGGCTTGCTTTAACAGTGCGTCATACAGGAAACCAGTAGGCCAGCACAGCATGC +TTGAAGCTATGGCCCACAGATTAAGAATGGATGCACGACTGGACTATGAGTCAGGAAGGATGTCAAAAGAGGATTTCGAA +AAAGCAATGGCTCACCTTGGTGAGATTGGGTACATGTAAGCTCCGGAAATGTCTATGGGGTTATTGGTCATCGTTGAATA +CATGCGGTGCACAAATGATTAAAATGAAAAAAGGCTCGTGTTTCTACT +>kraken:taxid|518987|NC_002205.1 Influenza B virus (B/Lee/1940) segment 2, complete sequence +ATGACGTTGGCTAAAATTGAACTACTAAAGCAGCTGTTAAGGGACAATGAAGCCAAAACGGTGTTGAGACAGACAACGGT +AGACCAATACAACATAATAAGAAAATTCAATACATCAAGAATTGAAAAGAACCCTTCATTAAGAATGAAGTGGGCCATGT +GTTCCAATTTTCCCTTAGCTCTGACCAAGGGTGATATGGCAAATCGAATCCCCTTGGAATACAAGGGAATACAACTTAAA +ACAAATGCTGAAGACATAGGAACTAAAGGACAAATGTGTTCAATAGCAGCAGTTACCTGGTGGAATACATATGGGCCCAT +AGGGGATACTGAAGGGTTTGAAAAGGTCTACGAAAGCTTTTTTCTCAGAAAGATGAGACTTGACAATGCCACTTGGGGCC +GAATAACCTTTGGCCCTGTTGAGAGAGTAAGAAAAAGAGTACTACTAAACCCGCTCACCAAGGAAATGCCCCCAGATGAA +GCGAGCAATGTAATAATGGAAATATTATTCCCTAAAGAAGCAGGAATACCAAGAGAATCTACTTGGATACATAGAGAACT +GATAAAAGAAAAAAGAGAAAAATTGAAGGGAACGATGATAACTCCCATTGTACTGGCATACATGCTTGAGAGAGAACTAG +TTGCCCGAAGAAGGTTCCTGCCAGTAGCAGGAGCAACATCAGCAGAGTTCATAGAAATGCTACATTGCTTACAAGGTGAA +AATTGGAGACAAATATATCATCCAGGAGGGAATAAACTAACTGAATCTAGATCTCAATCAATGATTGTAGCTTGCAGGAA +GATAATCAGAAGATCAATAGTTGCATCAAACCCACTAGAGCTAGCTGTAGAGATTGCAAATAAGACTGTGATAGACACTG +AACCTTTAAAATCATGTCTGGCAGCCCTGGATGGAGGTGATGTAGCCTGTGACATAATAAGAGCTGCATTAGGATTAAAA +ATTAGACAAAGACAAAGATTTGGGAGACTTGAACTAAAGAGAATATCAGGAAGAGGATTCAAAAATGATGAAGAGATATT 
+AATCGGAAACGGAACAATACAAAAGATTGGAATATGGGACGGAGAAGAGGAATTCCATGTAAGATGTGGCGAATGCAGGG +GGATATTGAAAAAAAGCCAAATGAGAATGGAAAAACTACTGATAAATTCAGCCAAAAAGGAGGACATGAAAGATTTAATA +ATCTTATGCATGGTATTTTCTCAAGACACTAGGATGTTCCAAGGAGTGAGAGGAGAGATAAATTTTCTTAATCGAGCAGG +CCAACTTTTATCCCCCATGTACCAACTCCAACGATACTTTCTGAATAGGAGCAATGACCTTTTTGATCAATGGGGATATG +AGGAATCACCTAAAGCAAGTGAGCTACATGGGATAAATGAATTAATGAATGCATCTGACTATACATTGAAAGGGGTTGTA +GTAACAAAAAATGTGATTGATGATTTTAGTTCTACTGAAACAGAAAAAGTATCTATAACAAAAAATCTTAGTTTAATAAA +AAGGACTGGGGAAGTTATAATGGGAGCCAATGACGTAAGTGAATTAGAATCACAAGCACAGCTAATGATAACGTATGATA +CACCCAAGATGTGGGAAATGGGAACAACCAAAGAACTGGTACAAAACACTTACCAATGGGTGCTTAAAAATTTAGTAACA +TTGAAGGCTCAGTTTCTTTTGGGAAAAGAAGACATGTTCCAATGGGATGCATTTGAAGCATTTGAAAGCATAATCCCTCA +GAAGATGGCTGGTCAGTACAGTGGATTTGCAAGAGCAGTGCTCAAACAAATGAGAGACCAAGAGGTTATGAAAACTGACC +AATTCATAAAATTGTTGCCTTTCTGTTTTTCGCCACCAAAATTAAGGAGCAATGGAGAGCCTTATCAATTTTTGAGGCTT +ATGCTGAAAGGAGGAGGGGAAAATTTCATCGAAGTAAGGAAAGGGTCCCCCTTGTTCTCCTACAATCCACAAACGGAAAT +CCTAACTATATGCGGCAGAATGATGTCATTAAAAGGAAAAATTGAGGATGAAGAAAGAAATAGATCAATGGGGAATGCAG +TACTGGCAGGCTTTCTTGTTAGTGGCAAATATGACCCTGATCTTGGAGATTTCAAAACCATTGAGGAACTTGAAAGACTA +AAACCGGGAGAAAAAGCCAACATCTTACTTTACCAAGGAAAGCCCGTTAAAGTAGTTAAAAGGAAAAGATATAGTGCTTT +ATCCAATGATATTTCACAAGGGATTAAGAGACAAAGAATGACAGTTGAGTCCATGGGGTGGGCCTTGAGCTAA +>kraken:taxid|518987|NC_002206.1 Influenza B virus (B/Lee/1940) segment 3, complete sequence +ATGGATACTTTTATTACAAAGAATTTCCAGACTACAATAATACAAAAGGCCAAAAACACAATGGCAGAATTTAGTGAAGA +TCCTGAATTACAGCCAGCAGTACTATTCAACATCTGCGTCCATCTGGAGGTCTGCTATGTAATAAGTGATATGAACTTTC +TTGATGAGGAAGGAAAGACATATACAGCATTAGAAGGACAAGGAAAAGAGCAAAATTTGAGACCACAGTATGAAGTGATT +GAGGGAATGCCAAGAAACATAGCATGGATGGTTCAAAGATCCTTAGCCCAAGAGCATGGAATAGAGACTCCAAGGTATCT +GGCTGATTTATTTGATTATAAAACCAAGAGGTTTATCGAAGTCGGAATAACAAAGGGATTGGCTGATGATTACTTTTGGA +AAAAGAAAGAAAAGTTGGGGAATAGCATGGAACTGATGATATTCAGCTACAATCAAGACTACTCGTTAAGTGATGAATCT +TCATTGGATGAGGAAGGAAAAGGGAGAGTGCTAAGCAGACTCACAGAACTTCAGGCTGAGTTAAGTTTGAAAAACCTATG 
+GCAAGTTCTAATAGGGGAAGAAGAAATTGAAAAAGGAATTGACTTCAAACTTGGACAAACAATATCTAAACTGAGGAATA +TATCTGTTCCAGCTGGTTTCTCCAATTTTGAAGGGATGAGAAGTTACATAGACAACATAGACCCTAAAGGAGCAATAGAG +AGAAATCTAGCAAGGATGTCTCCCTTAGTATCAGTTACACCCAAAAAGTTGAAATGGGAGGACCTGAGACCCATAGGGCC +TCACATTTACAACCATGAGCTACCAGAAGTTCCATATAATGCCTTTCTCCTCATGTCTGATGAGTTGGGGCTGGCCAATA +TGACTGAAGGAAAGTCCAAGAAACCGAAGACCTTAGCTAAGGAATGTCTAGAAAGGTATTCAACACTACGTGATCAAACT +GACCCAATATTGATAATGAAAAGCGAAAAAGCTAACGAAAACTTCTTATGGAGGTTATGGAGGGACTGTGTAAATACAAT +AAGCAATGAGGAAACAGGCAACGAATTACAGAAAACCAATTATGCCAAGTGGGCCACAGGAGATGGACTAACATACCAAA +AAATAATGAAAGAAGTAGCAATAGATGACGAAACGATGTACCAAGAAGAACCCAAAATACCCAATAAATGTAGAGTGGCT +GCTTGGGTTCAGGCAGAGATGAATCTACTGAGTACTCTGACAAGTAAAAGGGCCCTGGATCTGCCAGAAATAGGGCCAGA +TGTAGCACCCGTGGAGCATGTAGGGAGTGAAAGAAGGAAATACTTTGTTAATGAAATCAACTACTGTAAAGCCTCTACAG +TTATGATGAAGTATGTACTTTTTCACACTTCATTATTAAATGAAAGCAATGCTAGTATGGGAAAATATAAAGTAATACCA +ATCACCAACAGAGTGGTAAATGAAAAAGGGGAAAGCTTTGACATGCTTTATGGTCTGGCGGTTAAGGGGCAATCTCATTT +GCGGGGGGACACGGATGTTGTAACAGTTGTGACTTTCGAGTTTAGTAGTACAGATCCTAGAGTGGACTCAGGAAAGTGGC +CAAAATATACTGTCTTTAAAATTGGCTCCCTATTTGTGAGTGGAAGAGAAAAACCTGTGTACCTATATTGCCGAGTGAAT +GGTACAAACAAAATCCAAATGAAATGGGGAATGGAAGCTAGAAGATGTCTGCTTCAATCAATGCAACAAATGGAGGCAAT +TGTTGATCAAGAATCATCGATACAAGGGTATGATATGACCAAAGCTTGTTTCAAGGGAGACAGAGTGAATAATCCCAAAA +CTTTCAGTATTGGGACTCAGGAAGGCAAACTAGTAAAAGGGTCCTTTGGGAAAGCACTAAGAGTAATATTCACCAAATGT +TTGATGCATTATGTATTTGGAAATGCTCAATTGGAGGGGTTTAGTGCCGAATCTAGGAGACTTCTACTGTTAATTCAGGC +ATTAAAAGACAGGAAGGGCCCTTGGGTATTTGACTTGGAGGGAATGTACTTTGGAGTAGAGGAATGTATTAGTAACAATC +CTTGGGTAATACAGAGTGCATACTGGTTTAATGAATGGTTGGGCATTGAAAAAGAAGGAAGTAAAGTGTTAGAATCAATA +GATGAAATAATGGATGAATGAACGAAGGGCATAGCGCTCAATTT +>kraken:taxid|518987|NC_002207.1 Influenza B virus (B/Lee/1940) segment 4, complete sequence +AGCAGAAGCGTTGCATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGA +TCGAATCTGCACTGGGATAACATCGTCAAACTCACCTCATGTGGTTAAAACTGCCACTCAAGGGGAAGTCAATGTGACTG 
+GTGTGATACCACTAACAACAACACCTACCAAATCTCATTTTGCAAATCTCAAAGGAACACAGACCAGAGGAAAACTATGC +CCAAACTGTTTTAACTGCACAGATCTGGACGTGGCCCTAGGCAGACCAAAATGCATGGGGAACACACCCTCCGCAAAAGT +CTCAATACTCCATGAAGTCAAACCTGCTACATCTGGATGCTTTCCTATAATGCACGACAGAACAAAAATCAGACAACTAC +CTAATCTTCTCAGAGGATATGAAAACATCAGGTTATCAACCAGTAATGTTATCAATACAGAGACGGCACCAGGAGGACCC +TACAAGGTGGGGACCTCAGGATCTTGCCCTAACGTTGCTAATGGGAACGGCTTCTTCAACACAATGGCTTGGGTTATCCC +AAAAGACAACAACAAGACAGCAATAAATCCAGTAACAGTAGAAGTACCATACATTTGTTCAGAAGGGGAAGACCAAATTA +CTGTTTGGGGGTTCCACTCTGATGACAAAACCCAAATGGAAAGACTCTATGGAGACTCAAATCCTCAAAAGTTCACCTCA +TCTGCCAATGGAGTAACCACACATTATGTTTCTCAGATTGGTGGCTTCCCAAATCAAACAGAAGACGAAGGGCTAAAACA +AAGCGGCAGAATTGTTGTTGATTACATGGTACAAAAACCTGGAAAAACAGGAACAATTGTTTATCAAAGAGGCATTTTAT +TGCCTCAAAAAGTGTGGTGCGCAAGTGGCAGGAGCAAGGTAATAAAAGGGTCCTTGCCTTTAATTGGTGAAGCAGATTGC +CTCCACGAAAAGTACGGTGGATTAAATAAAAGCAAGCCTTACTACACAGGAGAGCATGCAAAGGCCATAGGAAATTGCCC +AATATGGGTGAAAACACCCTTGAAGCTGGCCAATGGAACCAAATATAGACCGCCTGCAAAACTATTAAAGGAAAGAGGTT +TCTTCGGAGCTATTGCTGGTTTCTTGGAAGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATACACATCTCATGGA +GCACATGGAGTGGCAGTGGCAGCAGACCTTAAGAGTACACAAGAAGCTATAAACAAGATAACAAAAAATCTCAACTATTT +AAGTGAGCTAGAAGTAAAAAACCTTCAAAGACTAAGCGGAGCAATGAATGAGCTTCACGACGAAATACTCGAGCTAGACG +AAAAAGTGGATGATCTAAGAGCTGATACAATAAGCTCACAAATAGAGCTTGCAGTCTTGCTTTCCAACGAAGGGATAATA +AACAGTGAAGATGAGCATCTCTTGGCACTTGAAAGAAAACTGAAGAAAATGCTTGGCCCCTCTGCTGTAGAAATAGGGAA +TGGGTGCTTTGAAACCAAACACAAATGCAACCAGACTTGCCTAGACAGGATAGCTGCTGGCACCTTTAATGCAGGAGATT +TTTCTCTTCCCACTTTTGATTCATTAAACATTACTGCTGCATCTTTAAATGATGATGGCTTGGATAATCATACTATACTG +CTCTACTACTCAACTGCTGCTTCTAGCTTGGCTGTAACATTAATGATAGCTATCTTCATTGTCTACATGGTCTCCAGAGA +CAATGTTTCTTGTTCCATCTGTCTGTGAGGGAGATTAAGCCCTGTGTTTTCCTTTACTGTAGTGCTCATTTGCTTGTCAC +CATTACAAAGAAACGTTATTGAAAAATGCTCTTGTTACTACT +>kraken:taxid|518987|NC_002208.1 Influenza B virus (B/Lee/1940) segment 5, complete sequence +GGCAGAAGCACAGCATTTTCTTGTGAGCTTCGAGCACTAATAAAACTGAAAATCAAAATGTCCAACATGGATATTGACAG 
+TATAAATACCGGAACAATCGATAAAACACCAGAAGAACTGACTCCCGGAACCAGTGGGGCAACCAGACCAATCATCAAGC +CAGCAACCCTTGCTCCGCCAAGCAACAAACGAACCCGAAATCCATCTCCAGAAAGGACAACCACAAGCAGTGAAACCGAT +ATCGGAAGGAAAATCCAAAAGAAACAAACCCCAACAGAGATAAAGAAGAGCGTCTACAAAATGGTGGTAAAACTGGGTGA +ATTCTACAACCAGATGATGGTCAAAGCTGGACTTAATGATGACATGGAAAGGAATCTAATTCAAAATGCACAAGCTGTGG +AGAGAATCCTATTGGCTGCAACTGATGACAAGAAAACTGAATACCAAAAGAAAAGGAATGCCAGAGATGTCAAAGAAGGG +AAGGAAGAAATAGACCACAACAAGACAGGAGGCACCTTTTATAAGATGGTAAGAGATGATAAAACCATCTACTTCAGCCC +TATAAAAATTACCTTTTTAAAAGAAGAGGTGAAAACAATGTACAAGACCACCATGGGGAGTGATGGTTTCAGTGGACTAA +ATCACATTATGATTGGACATTCACAGATGAACGATGTCTGTTTCCAAAGATCAAAGGGACTGAAAAGGGTTGGACTTGAC +CCTTCATTAATCAGTACTTTTGCCGGAAGCACACTACCCAGAAGATCAGGTACAACTGGTGTTGCAATCAAAGGAGGTGG +AACTTTAGTGGATGAAGCCATCCGATTTATAGGAAGAGCAATGGCAGACAGAGGGCTACTGAGAGACATCAAGGCCAAGA +CGGCCTATGAAAAGATTCTTCTGAATCTGAAAAACAAGTGCTCTGCGCCGCAACAAAAGGCTCTAGTTGATCAAGTGATC +GGAAGTAGGAACCCAGGGATTGCAGACATAGAAGACCTAACTCTGCTTGCCAGAAGCATGGTAGTTGTCAGACCCTCTGT +AGCGAGCAAAGTGGTGCTTCCCATAAGCATTTATGCTAAAATACCTCAACTAGGATTCAATACCGAAGAATACTCTATGG +TTGGGTATGAAGCCATGGCTCTTTATAATATGGCAACACCTGTTTCCATATTAAGAATGGGAGATGACGCAAAAGATAAA +TCTCAACTATTCTTCATGTCGTGCTTCGGAGCTGCCTATGAAGATCTAAGAGTGTTATCTGCACTAACGGGCACCGAATT +TAAGCCTAGATCAGCACTAAAATGCAAGGGTTTCCATGTCCCGGCTAAGGAGCAAGTAGAAGGAATGGGGGCAGCTCTGA +TGTCCATCAAGCTTCAGTTCTGGGCCCCAATGACCAGATCTGGAGGGAATGAAGTAAGTGGAGAAGGAGGGTCTGGTCAA +ATAAGTTGCAGCCCTGTGTTTGCAGTAGAAAGACCTATTGCTCTAAGCAAGCAAGCTGTAAGAAGAATGCTGTCAATGAA +CGTTGAAGGACGTGATGCAGATGTCAAAGGAAATCTACTCAAAATGATGAATGATTCAATGGCAAAGAAAACCAGTGGAA +ATGCTTTCATTGGGAAGAAAATGTTTCAAATATCAGACAAAAACAAAGTCAATCCCATTGAGATTCCAATTAAGCAGACC +ATCCCCAATTTCTTCTTTGGGAGGGACACAGCAGAGGATTATGATGACCTCGATTATTAAAGCAATAAAATAGACACTAT +GGCTGTGACTGTTTCAGTACGTTTGGGATGTGGGTGTTTACTCTTATTGAAATAAATGTAAAAAATGCTGTTGTTTCTAC +T +>kraken:taxid|518987|NC_002209.1 Influenza B virus (B/Lee/1940) segment 6, complete sequence +AGCAGAAGCAGAGCATATTCTTAGAACTGAAGTGAACAGGCCAAAAATGAACAATGCTACCTTCAACTGTACAAACATTA 
+ACCCTATTACTCACATCAGGGGGAGTATTATTATCACTATATGTGTCAGCCTCATTGTCATACTTATTGTATTCGGATGT +ATTGCTAAAATTTTCATCAACAAAAACAACTGCACCAACAATGTCATTAGAGTGCACAAACGCATCAAATGCCCAGACTG +TGAACCATTCTGCAACAAAAGAGATGACATTTCCACCCCCAGAGCCGGAGTGGACATACCCTCGTTTATCTTGCCAGGGC +TCAACCTTTCAGAAGGCACTCCTAATTAGCCCTCATAGGTTCGGAGAGATCAAAGGAAACTCAGCTCCCTTGATAATAAG +AGAACCTTTTGTTGCTTGTGGACCAAAAGAATGCAGACACTTTGCTCTGACCCATTATGCAGCTCAGCCGGGGGGATACT +ACAATGGAACAAGAAAGGACAGAAACAAGCTGAGGCATCTAGTATCAGTCAAATTGGGAAAAATCCCAACTGTGGAAAAC +TCCATTTTCCACATGGCAGCTTGGAGCGGATCCGCATGCCATGATGGTAGAGAATGGACATATATCGGAGTTGATGGTCC +TGACAATGATGCATTGGTCAAAATAAAATATGGAGAAGCATATACTGACACATATCATTCCTATGCACACAACATCCTAA +GAACACAAGAAAGTGCCTGCAATTGCATCGGGGGAGATTGTTATCTTATGATAACAGACGGCTCAGCTTCAGGAATTAGT +AAATGCAGATTTCTTAAAATTAGAGAGGGTCGAATAATAAAAGAAATACTTCCAACAGGAAGAGTGGAGCACACTGAAGA +GTGCACATGCGGGTTCGCCAGCAATAAAACCATAGAATGTGCCTGTAGAGACAACAGTTACACAGCAAAAAGACCCTTTG +TCAAATTAAATGTGGAAACTGATACAGCTGAAATAAGATTGATGTGCACAAAGACTTATCTAGACACTCCCAGACCGGAT +GATGGAAGCATAGCAGGGCCTTGCGAATCTAATGGAGACAAGTGGCTTGGAGGCATCAAAGGAGGATTCGTCCATCAAAG +AATGGCATCTAAGATTGGAAGATGGTACTCCCGAACGATGTCTAAAACTAACAGAATGGGGATGGAACTGTATGTAAAGT +ATGATGGTGACCCATGGACTGACAGTGATGCTCTTACTCTTAGTGGAGTAATGGTTTCCATAGAAGAACCTGGTTGGTAT +TCTTTTGGCTTCGAAATAAAGGACAAGAAATGTGATGTCCCTTGTATTGGGATAGAGATGGTACACGATGGTGGAAAAGA +TACTTGGCATTCAGCTGCAACAGCCATTTACTGTTTGATGGGCTCAGGACAATTGCTATGGGACACTGTCACAGGCGTTG +ATATGGCTTTATAATAGAGGAATGGTTGGATCTGTTCTAAACCCTTTGTTCCTATTTTATTTGAACAGTTGTTCTTACTA +GATTTAATTGTTTCTGAAAAATGCTCTTGTTACTACT +>kraken:taxid|518987|NC_002210.1 Influenza B virus (B/Lee/1940) segment 7, complete sequence +AGCAGAAGCACGCACTTTCTTAAAATGTCGCTGTTTGGAGACACAATTGCCTACCTGCTTTCACTAATAGAAGATGGAGA +AGGCAAAGCAGAACTAGCTGAAAAATTACACTGTTGGTTCGGTGGGAAAGAATTTGACCTAGATTCTGCTTTGGAATGGA +TAAAAAACAAAAGGTGCCTAACTGATATACAAAAAGCACTAATTGGTGCCTCTATATGCTTTTTAAAACCCAAAGACCAA +GAAAGAAAAAGGAGATTCATCACAGAGCCCCTGTCAGGAATGGGAACAACAGCAACAAAGAAGAAAGGCCTAATTCTAGC 
+TGAGAGAAAAATGAGAAGATGTGTAAGCTTTCATGAAGCATTTGAAATAGCAGAAGGCCACGAAAGCTCAGCATTACTAT +ATTGTCTTATGGTCATGTACCTAAACCCTGAAAACTATTCAATGCAAGTAAAACTAGGAACGCTCTGTGCTTTATGCGAG +AAACAAGCATCGCACTCGCATAGAGCCCATAGCAGAGCAGCAAGGTCTTCGGTACCTGGAGTAAGACGAGAAATGCAGAT +GGTTTCAGCTATGAACACAGCAAAGACAATGAATGGAATGGGAAAGGGAGAAGACGTCCAAAAACTAGCAGAAGAGCTGC +AAAACAACATTGGAGTGTTGAGATCTCTAGGAGCAAGTCAAAAGAATGGAGAAGGAATTGCCAAAGATGTAATGGAAGTG +CTAAAACAGAGCTCTATGGGAAATTCAGCTCTTGTGAGGAAATACTTATAATGCTCGAACCACTTCAGATTCTTTCAATT +TGTTCTTTCATTTTATCAGCTCTCCATTTCATGGCTTGGACAATAGGGCATTTGAATCAAATAAAAAGAGGGGTAAACTT +GAAAATACAAATAAGGAATCCAAATAAGGAGGCAATAAACAGAGAGGTGTCAATTCTGAGACACAATTACCAAAAGGAAA +TCCAAGCCAAAGAAACAATGAAGAAAATACTCTCTGACAACATGGAAGTATTGGGTGACCACATAGTAGTTGAAGGGCTT +TCAACTGATGAGATAATAAAAATGGGTGAAACAGTTTTGGAGGTGGAAGAATTGCAATGAGCCCAATTTTCACTGTATTT +CTTACTATGCATTTAAGCAAATTGTAATCAATGTCAGTGAATAAAACTGGAAAAAGTGCGTTGTTTCTACT +>kraken:taxid|518987|NC_002211.1 Influenza B virus (B/Lee/1940) segment 8, complete sequence +CGCAGAAGCAGAGGATTTATTTAGTCACTGGCAAACGGAAAGATGGCGGACAACATGACCACAACACAAATTGAGGTGGG +TCCGGGAGCAACCAATGCCACTATAAACTTTGAAGCAGGAATTCTGGAGTGCTATGAAAGGTTTTCATGGCAAAGAGCCC +TTGACTATCCTGGTCAAGACCGCCTACACAGACTAAAACGAAAATTAGAATCAAGAATAAAGACTCACAACAAGAGTGAG +CCTGAGAATAAAAGGATGTCTCTTGAAGAGAGAAAAGCAATTGGGGTAAAAATGATGAAAGTGCTTCTGTTTATGGATCC +CTCTGCTGGAATTGAAGGGTTTGAGCCATACTGTGTGAAAAATCCCTCAACTAGCAAATGTCCAAATTACGATTGGACCG +ATTACCCTCCAACCCCAGGAAAGTACCTTGATGACATAGAAGAAGAGCCGGAAAATGTCGATCACCCAATTGAGGTAGTA +TTAAGGGACATGAACAATAAAGATGCACGACAAAAGATAAAGGATGAAGTAAACACTCAGAAAGAGGGGAAATTCCGTTT +GACAATAAAAAGGGATATACGTAATGTGTTGTCCTTGAGAGTGTTGGTGAACGGAACCTTCCTCAAGCACCCTAATGGAG +ACAAGTCCTTATCAACTCTTCATAGATTGAATGCATATGACCAGAATGGAGGGCTTGTTGCTAAACTTGTTGCTACTGAT +GATCGGACAGTGGAGGATGAAAAAGATGGCCATCGGATCCTCAACTCACTCTTCGAGCGTTTTGATGAAGGACATTCAAA +GCCAATTCGAGCAGCTGAAACTGCGGTGGGAGTCTTATCCCAATTTGGTCAAGAGCACCGATTATCACCAGAAGAGGGAG +ACAATTAGACTGGCCACGGAAGAACTTTATCTCTTGAGTAAAAGAATTGATGATAGTATATTGTTCCACAAAACAGTAAT 
+AGCTAACAGCTCCATAATAGCTGACATGATTGTATCATTATCATTACTGGAAACATTGTATGAAATGAAGGATGTGGTTG +AAGTGTACAGCAGGCAGTGCTTATGAATGTAAAATAAAAATCCTCTTGTTACTACT diff --git a/data/HIV_1.fna b/data/HIV_1.fna new file mode 100644 index 0000000..6ed3f29 --- /dev/null +++ b/data/HIV_1.fna @@ -0,0 +1,116 @@ +>kraken:taxid|11676|NC_001802.1 Human immunodeficiency virus 1, complete genome +GGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGC +TTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTC +AGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAG +GACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAG +GCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAG +GCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTG +GCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTT +AGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGA +CAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCA +GCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGG +GTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACA +AGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAG +CTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGAC +ATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTA +TAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGAC +CAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAAT +TGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTAC +ACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCC +AAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGT 
+GGCAAAGAAGGGCACACAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCA +AATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTC +TTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAG +CAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAG +ATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAG +ATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTG +GACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGT +TGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAA +ACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAA +AAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTA +GATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAA +GAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTG +CATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGA +TCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCA +ATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATC +TGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCAT +CCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGG +GAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCAC +TAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACAT +GGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTA +TCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAA +CAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAG +GAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTT +AGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGG 
+AGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAG +AAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGC +ATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGG +AAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGA +ATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAAT +GGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAG +CCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTA +GCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTT +AAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCG +CCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAAT +AAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCAT +CCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATAC +AAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGG +AAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAG +AAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATT +AGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGA +AAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTC +TGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTA +GACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATT +AGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAG +CATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAG +AAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGA +CATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGC +CATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGA 
+CAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTT +GTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGC +AGGAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGT +ACATGTAATGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCA +TAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAA +GACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGA +TGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACC +ACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCAC +AGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGA +TGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAG +TGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTG +CTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAA +TAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTT +GAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGG +ACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCA +GTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACA +TCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATT +TGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAAC +AGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAA +ATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAA +TAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTA +TAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACA +GGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGA +CAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAA 
+GAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATG +GGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAG +GGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGG +AAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGG +AATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAA +TTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAG +ATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGA +GGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTT +TCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACA +GATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGC +TTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTG +GAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTG +AGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGC +TTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAA +TGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGC +AATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCA +GGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGC +TAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAAC +TACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGAT +AGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAG +TGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGC +TGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCG +AGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAG +CTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTC diff --git a/data/MERS.fa b/data/MERS.fa new file 
mode 100644 index 0000000..18de91c --- /dev/null +++ b/data/MERS.fa @@ -0,0 +1,378 @@ +>kraken:taxid|1335626|NC_019843.3 Middle East respiratory syndrome coronavirus, complete genome +GATTTAAGTGAATAGCTTGGCTATCTCACTTCCCCTCGTTCTCTTGCAGAACTTTGATTTTAACGAACTTAAATAAAAGC +CCTGTTGTTTAGCGTATCGTTGCACTTGTCTGGTGGGATTGTGGCATTAATTTGCCTGCTCATCTAGGCAGTGGACATAT +GCTCAACACTGGGTATAATTCTAATTGAATACTATTTTTCAGTTAGAGCGTCGTGTCTCTTGTACGTCTCGGTCACAATA +CACGGTTTCGTCCGGTGCGTGGCAATTCGGGGCACATCATGTCTTTCGTGGCTGGTGTGACCGCGCAAGGTGCGCGCGGT +ACGTATCGAGCAGCGCTCAACTCTGAAAAACATCAAGACCATGTGTCTCTAACTGTGCCACTCTGTGGTTCAGGAAACCT +GGTTGAAAAACTTTCACCATGGTTCATGGATGGCGAAAATGCCTATGAAGTGGTGAAGGCCATGTTACTTAAAAAGGAGC +CACTTCTCTATGTGCCCATCCGGCTGGCTGGACACACTAGACACCTCCCAGGTCCTCGTGTGTACCTGGTTGAGAGGCTC +ATTGCTTGTGAAAATCCATTCATGGTTAACCAATTGGCTTATAGCTCTAGTGCAAATGGCAGCCTGGTTGGCACAACTTT +GCAGGGCAAGCCTATTGGTATGTTCTTCCCTTATGACATCGAACTTGTCACAGGAAAGCAAAATATTCTCCTGCGCAAGT +ATGGCCGTGGTGGTTATCACTACACCCCATTCCACTATGAGCGAGACAACACCTCTTGCCCTGAGTGGATGGACGATTTT +GAGGCGGATCCTAAAGGCAAATATGCCCAGAATCTGCTTAAGAAGTTGATTGGCGGTGATGTCACTCCAGTTGACCAATA +CATGTGTGGCGTTGATGGAAAACCCATTAGTGCCTACGCATTTTTAATGGCCAAGGATGGAATAACCAAACTGGCTGATG +TTGAAGCGGACGTCGCAGCACGTGCTGATGACGAAGGCTTCATCACATTAAAGAACAATCTATATAGATTGGTTTGGCAT +GTTGAGCGTAAAGACGTTCCATATCCTAAGCAATCTATTTTTACTATTAATAGTGTGGTCCAAAAGGATGGTGTTGAAAA +CACTCCTCCTCACTATTTTACTCTTGGATGCAAAATTTTAACGCTCACCCCACGCAACAAGTGGAGTGGCGTTTCTGACT +TGTCCCTCAAACAAAAACTCCTTTACACCTTCTATGGTAAGGAGTCACTTGAGAACCCAACCTACATTTACCACTCCGCA +TTCATTGAGTGTGGAAGTTGTGGTAATGATTCCTGGCTTACAGGGAATGCTATCCAAGGGTTTGCCTGTGGATGTGGGGC +ATCATATACAGCTAATGATGTCGAAGTCCAATCATCTGGCATGATTAAGCCAAATGCTCTTCTTTGTGCTACTTGCCCCT +TTGCTAAGGGTGATAGCTGTTCTTCTAATTGCAAACATTCAGTTGCTCAGTTGGTTAGTTACCTTTCTGAACGCTGTAAT +GTTATTGCTGATTCTAAGTCCTTCACACTTATCTTTGGTGGCGTAGCTTACGCCTACTTTGGATGTGAGGAAGGTACTAT +GTACTTTGTGCCTAGAGCTAAGTCTGTTGTCTCAAGGATTGGAGACTCCATCTTTACAGGCTGTACTGGCTCTTGGAACA +AGGTCACTCAAATTGCTAACATGTTCTTGGAACAGACTCAGCATTCCCTTAACTTTGTGGGAGAGTTCGTTGTCAACGAT 
+GTTGTCCTCGCAATTCTCTCTGGAACCACAACTAATGTTGACAAAATACGCCAGCTTCTCAAAGGTGTCACCCTTGACAA +GTTGCGTGATTATTTAGCTGACTATGACGTAGCAGTCACTGCCGGCCCATTCATGGATAATGCTATTAATGTTGGTGGTA +CAGGATTACAGTATGCCGCCATTACTGCACCTTATGTAGTTCTCACTGGCTTAGGTGAGTCCTTTAAGAAAGTTGCAACC +ATACCGTATAAGGTTTGCAACTCTGTTAAGGATACTCTGGCTTATTATGCTCACAGCGTGTTGTACAGAGTTTTTCCTTA +TGACATGGATTCTGGTGTGTCATCCTTTAGTGAACTACTTTTTGATTGCGTTGATCTTTCAGTAGCTTCTACCTATTTTT +TAGTCCGCATCTTGCAAGATAAGACTGGCGACTTTATGTCTACAATTATTACTTCCTGCCAAACTGCTGTTAGTAAGCTT +CTAGATACATGTTTTGAAGCTACAGAAGCAACATTTAACTTCTTGTTAGATTTGGCAGGATTGTTCAGAATCTTTCTCCG +CAATGCCTATGTGTACACTTCACAAGGGTTTGTGGTGGTCAATGGCAAAGTTTCTACACTTGTCAAACAAGTGTTAGACT +TGCTTAATAAGGGTATGCAACTTTTGCATACAAAGGTCTCCTGGGCTGGTTCTAAAATCATTGCTGTTATCTACAGCGGC +AGGGAGTCTCTAATATTCCCATCGGGAACCTATTACTGTGTCACCACTAAGGCTAAGTCCGTTCAACAAGATCTTGACGT +TATTTTGCCTGGTGAGTTTTCCAAGAAGCAGTTAGGACTGCTCCAACCTACTGACAATTCTACAACTGTTAGTGTTACTG +TATCCAGTAACATGGTTGAAACTGTTGTGGGTCAACTTGAGCAAACTAATATGCATAGTCCTGATGTTATAGTAGGTGAC +TATGTCATTATTAGTGAAAAATTGTTTGTGCGTAGTAAGGAAGAAGACGGATTTGCCTTCTACCCTGCTTGCACTAATGG +TCATGCTGTACCGACTCTCTTTAGACTTAAGGGAGGTGCACCTGTAAAAAAAGTAGCCTTTGGCGGTGATCAAGTACATG +AGGTTGCTGCTGTAAGAAGTGTTACTGTCGAGTACAACATTCATGCTGTATTAGACACACTACTTGCTTCTTCTAGTCTT +AGAACCTTTGTTGTAGATAAGTCTTTGTCAATTGAGGAGTTTGCTGACGTAGTAAAGGAACAAGTCTCAGACTTGCTTGT +TAAATTACTGCGTGGAATGCCGATTCCAGATTTTGATTTAGACGATTTTATTGACGCACCATGCTATTGCTTTAACGCTG +AGGGTGATGCATCCTGGTCTTCTACTATGATCTTCTCTCTTCACCCCGTCGAGTGTGACGAGGAGTGTTCTGAAGTAGAG +GCTTCAGATTTAGAAGAAGGTGAATCAGAGTGCATTTCTGAGACTTCAACTGAACAAGTTGACGTTTCTCATGAGACTTC +TGACGACGAGTGGGCTGCTGCAGTTGATGAAGCGTTCCCTCTCGATGAAGCAGAAGATGTTACTGAATCTGTGCAAGAAG +AAGCACAACCAGTAGAAGTACCTGTTGAAGATATTGCGCAGGTTGTCATAGCTGACACCTTACAGGAAACTCCTGTTGTG +CCTGATACTGTTGAAGTCCCACCGCAAGTGGTGAAACTTCCGTCTGCACCTCAGACTATCCAGCCCGAGGTAAAAGAAGT +TGCACCTGTCTATGAGGCTGATACCGAACAGACACAGAATGTTACTGTTAAACCTAAGAGGTTACGCAAAAAGCGTAATG +TTGACCCTTTGTCCAATTTTGAACATAAGGTTATTACAGAGTGCGTTACCATAGTTTTAGGTGACGCAATTCAAGTAGCC 
+AAGTGCTATGGGGAGTCTGTGTTAGTTAATGCTGCTAACACACATCTTAAGCATGGCGGTGGTATCGCTGGTGCTATTAA +TGCGGCTTCAAAAGGGGCTGTCCAAAAAGAGTCAGATGAGTATATTCTGGCTAAAGGGCCGTTACAAGTAGGAGATTCAG +TTCTCTTGCAAGGCCATTCTCTAGCTAAGAATATCCTGCATGTCGTAGGCCCAGATGCCCGCGCTAAACAGGATGTTTCT +CTCCTTAGTAAGTGCTATAAGGCTATGAATGCATATCCTCTTGTAGTCACTCCTCTTGTTTCAGCAGGCATATTTGGTGT +AAAACCAGCTGTGTCTTTTGATTATCTTATTAGGGAGGCTAAGACTAGAGTTTTAGTCGTCGTTAATTCCCAAGATGTCT +ATAAGAGTCTTACCATAGTTGACATTCCACAGAGTTTGACTTTTTCATATGATGGGTTACGTGGCGCAATACGTAAAGCT +AAAGATTATGGTTTTACTGTTTTTGTGTGCACAGACAACTCTGCTAACACTAAAGTTCTTAGGAACAAGGGTGTTGATTA +TACTAAGAAGTTTCTTACAGTTGACGGTGTGCAATATTATTGCTACACGTCTAAGGACACTTTAGATGATATCTTACAAC +AGGCTAATAAGTCTGTTGGTATTATATCTATGCCTTTGGGATATGTGTCTCATGGTTTAGACTTAATGCAAGCAGGGAGT +GTCGTGCGTAGAGTTAACGTGCCCTACGTGTGTCTCCTAGCTAATAAAGAGCAAGAAGCTATTTTGATGTCTGAAGACGT +TAAGTTAAACCCTTCAGAAGATTTTATAAAGCACGTCCGCACTAATGGTGGTTACAATTCTTGGCATTTAGTCGAGGGTG +AACTATTGGTGCAAGACTTACGCTTAAATAAGCTCCTGCATTGGTCTGATCAAACCATATGCTACAAGGATAGTGTGTTT +TATGTTGTAAAGAATAGTACAGCTTTTCCATTTGAAACACTTTCAGCATGTCGTGCGTATTTGGATTCACGCACGACACA +GCAGTTAACAATCGAAGTCTTAGTGACTGTCGATGGTGTAAATTTTAGAACAGTCGTTCTAAATAATAAGAACACTTATA +GATCACAGCTTGGATGCGTTTTCTTTAATGGTGCTGATATTTCTGACACCATTCCTGATGAGAAACAGAATGGTCACAGT +TTATATCTAGCAGACAATTTGACTGCTGATGAAACAAAGGCGCTTAAAGAGTTATATGGCCCCGTTGATCCTACTTTCTT +ACACAGATTCTATTCACTTAAGGCTGCAGTCCATGGGTGGAAGATGGTTGTGTGTGATAAGGTACGTTCTCTCAAATTGA +GTGATAATAATTGTTATCTTAATGCAGTTATTATGACACTTGATTTATTGAAGGACATTAAATTTGTTATACCTGCTCTA +CAGCATGCATTTATGAAACATAAGGGCGGTGATTCAACTGACTTCATAGCCCTCATTATGGCTTATGGCAATTGCACATT +TGGTGCTCCAGATGATGCCTCTCGGTTACTTCATACCGTGCTTGCAAAGGCTGAGTTATGCTGTTCTGCACGCATGGTTT +GGAGAGAGTGGTGCAATGTCTGTGGCATAAAAGATGTTGTTCTACAAGGCTTAAAAGCTTGTTGTTACGTGGGTGTGCAA +ACTGTTGAAGATCTGCGTGCTCGCATGACATATGTATGCCAGTGTGGTGGTGAACGTCATCGGCAATTAGTCGAACACAC +CACCCCCTGGTTGCTGCTCTCAGGCACACCAAATGAAAAATTGGTGACAACCTCCACGGCGCCTGATTTTGTAGCATTTA +ATGTCTTTCAGGGCATTGAAACGGCTGTTGGCCATTATGTTCATGCTCGCCTGAAGGGTGGTCTTATTTTAAAGTTTGAC 
+TCTGGCACCGTTAGCAAGACTTCAGACTGGAAGTGCAAGGTGACAGATGTACTTTTCCCCGGCCAAAAATACAGTAGCGA +TTGTAATGTCGTACGGTATTCTTTGGACGGTAATTTCAGAACAGAGGTTGATCCCGACCTATCTGCTTTCTATGTTAAGG +ATGGTAAATACTTTACAAGTGAACCACCCGTAACATATTCACCAGCTACAATTTTAGCTGGTAGTGTCTACACTAATAGC +TGCCTTGTATCGTCTGATGGACAACCTGGCGGTGATGCTATTAGTTTGAGTTTTAATAACCTTTTAGGGTTTGATTCTAG +TAAACCAGTCACTAAGAAATACACTTACTCCTTCTTGCCTAAAGAAGACGGCGATGTGTTGTTGGCTGAGTTTGACACTT +ATGACCCTATTTATAAGAATGGTGCCATGTATAAAGGCAAACCAATTCTTTGGGTCAATAAAGCATCTTATGATACTAAT +CTTAATAAGTTCAATAGAGCTAGTTTGCGTCAAATTTTTGACGTAGCCCCCATTGAACTCGAAAATAAATTCACACCTTT +GAGTGTGGAGTCTACACCAGTTGAACCTCCAACTGTAGATGTGGTAGCACTTCAACAGGAAATGACAATTGTCAAATGTA +AGGGTTTAAATAAACCTTTCGTGAAGGACAATGTCAGTTTCGTTGCTGATGATTCAGGTACTCCCGTTGTTGAGTATCTG +TCTAAAGAAGACCTACATACATTGTATGTAGACCCTAAGTATCAAGTCATTGTCTTAAAAGACAATGTACTTTCTTCTAT +GCTTAGATTGCACACCGTTGAGTCAGGTGATATTAACGTTGTTGCAGCTTCCGGATCTTTGACACGTAAAGTGAAGTTAC +TATTTAGGGCTTCATTTTATTTCAAAGAATTTGCTACCCGCACTTTCACTGCTACCACTGCTGTAGGTAGTTGTATAAAG +AGTGTAGTGCGGCATCTAGGTGTTACTAAAGGCATATTGACAGGCTGTTTTAGTTTTGCCAAGATGTTATTTATGCTTCC +ACTAGCTTACTTTAGTGATTCAAAACTCGGCACCACAGAGGTTAAAGTGAGTGCTTTGAAAACAGCCGGCGTTGTGACAG +GTAATGTTGTAAAACAGTGTTGCACTGCTGCTGTTGATTTAAGTATGGATAAGTTGCGCCGTGTGGATTGGAAATCAACC +CTACGGTTGTTACTTATGTTATGCACAACTATGGTATTGTTGTCTTCTGTGTATCACTTGTATGTCTTCAATCAGGTCTT +ATCAAGTGATGTTATGTTTGAAGATGCCCAAGGTTTGAAAAAGTTCTACAAAGAAGTTAGAGCTTACCTAGGAATCTCTT +CTGCTTGTGACGGTCTTGCTTCAGCTTATAGGGCGAATTCCTTTGATGTACCTACATTCTGCGCAAACCGTTCTGCAATG +TGTAATTGGTGCTTGATTAGCCAAGATTCCATAACTCACTACCCAGCTCTTAAGATGGTTCAAACACATCTTAGCCACTA +TGTTCTTAACATAGATTGGTTGTGGTTTGCATTTGAGACTGGTTTGGCATACATGCTCTATACCTCGGCCTTCAACTGGT +TGTTGTTGGCAGGTACATTGCATTATTTCTTTGCACAGACTTCCATATTTGTAGACTGGCGGTCATACAATTATGCTGTG +TCTAGTGCCTTCTGGTTATTCACCCACATTCCAATGGCGGGTTTGGTACGAATGTATAATTTGTTAGCATGCCTTTGGCT +TTTACGCAAGTTTTATCAGCATGTAATCAATGGTTGCAAAGATACGGCATGCTTGCTCTGCTATAAGAGGAACCGACTTA +CTAGAGTTGAAGCTTCTACCGTTGTCTGTGGTGGAAAACGTACGTTTTATATCACAGCAAATGGCGGTATTTCATTCTGT 
+CGTAGGCATAATTGGAATTGTGTGGATTGTGACACTGCAGGTGTGGGGAATACCTTCATCTGTGAAGAAGTCGCAAATGA +CCTCACTACCGCCCTACGCAGGCCTATTAACGCTACGGATAGATCACATTATTATGTGGATTCCGTTACAGTTAAAGAGA +CTGTTGTTCAGTTTAATTATCGTAGAGACGGTCAACCATTCTACGAGCGGTTTCCCCTCTGCGCTTTTACAAATCTAGAT +AAGTTGAAGTTCAAAGAGGTCTGTAAAACTACTACTGGTATACCTGAATACAACTTTATCATCTACGACTCATCAGATCG +TGGCCAGGAAAGTTTAGCTAGGTCTGCATGTGTTTATTATTCTCAAGTCTTGTGTAAATCAATTCTTTTGGTTGACTCAA +GTTTGGTTACTTCTGTTGGTGATTCTAGTGAAATCGCCACTAAAATGTTTGATTCCTTTGTTAATAGTTTCGTCTCGCTG +TATAATGTCACACGCGATAAGTTGGAAAAACTTATCTCTACTGCTCGTGATGGCGTAAGGCGAGGCGATAACTTCCATAG +TGTCTTAACAACATTCATTGACGCAGCACGAGGCCCCGCAGGTGTGGAGTCTGATGTTGAGACCAATGAAATTGTTGACT +CTGTGCAGTATGCTCATAAACATGACATACAAATTACTAATGAGAGCTACAATAATTATGTACCCTCATATGTTAAACCT +GATAGTGTGTCTACCAGCGATTTAGGTAGTCTCATTGATTGTAATGCGGCTTCAGTTAACCAAATTGTCTTGCGTAATTC +TAATGGTGCTTGCATTTGGAACGCTGCTGCATATATGAAACTCTCGGATGCACTTAAACGACAGATTCGCATTGCATGCC +GTAAGTGTAATTTAGCTTTCCGGTTAACCACCTCAAAGCTACGCGCTAATGATAATATCTTATCAGTTAGATTCACTGCT +AACAAAATTGTTGGTGGTGCTCCTACATGGTTTAATGCGTTGCGTGACTTTACGTTAAAGGGTTATGTTCTTGCTACCAT +TATTGTGTTTCTGTGTGCTGTACTGATGTATTTGTGTTTACCTACATTTTCTATGGCACCTGTTGAATTTTATGAAGACC +GCATCTTGGACTTTAAAGTTCTTGATAATGGTATCATTAGGGATGTAAATCCTGATGATAAGTGCTTTGCTAATAAGCAC +CGGTCCTTCACACAATGGTATCATGAGCATGTTGGTGGTGTCTATGACAACTCTATCACATGCCCATTGACAGTTGCAGT +AATTGCTGGAGTTGCTGGTGCTCGCATTCCAGACGTACCTACTACATTGGCTTGGGTGAACAATCAGATAATTTTCTTTG +TTTCTCGAGTCTTTGCTAATACAGGCAGTGTTTGCTACACTCCTATAGATGAGATACCCTATAAGAGTTTCTCTGATAGT +GGTTGCATTCTTCCATCTGAGTGCACTATGTTTAGGGATGCAGAGGGCCGTATGACACCATACTGCCATGATCCTACTGT +TTTGCCTGGGGCTTTTGCGTACAGTCAGATGAGGCCTCATGTTCGTTACGACTTGTATGATGGTAACATGTTTATTAAAT +TTCCTGAAGTAGTATTTGAAAGTACACTTAGGATTACTAGAACTCTGTCAACTCAGTACTGCCGGTTCGGTAGTTGTGAG +TATGCACAAGAGGGTGTTTGTATTACCACAAATGGCTCGTGGGCCATTTTTAATGACCACCATCTTAATAGACCTGGTGT +CTATTGTGGCTCTGATTTTATTGACATTGTCAGGCGGTTAGCAGTATCACTGTTCCAGCCTATTACTTATTTCCAATTGA +CTACCTCATTGGTCTTGGGTATAGGTTTGTGTGCGTTCCTGACTTTGCTCTTCTATTATATTAATAAAGTAAAACGTGCT 
+TTTGCAGATTACACCCAGTGTGCTGTAATTGCTGTTGTTGCTGCTGTTCTTAATAGCTTGTGCATCTGCTTTGTTACCTC +TATACCATTGTGTATAGTACCTTACACTGCATTGTACTATTATGCTACATTCTATTTTACTAATGAGCCTGCATTTATTA +TGCATGTTTCTTGGTACATTATGTTCGGGCCTATCGTTCCCATATGGATGACCTGCGTCTATACAGTTGCAATGTGCTTT +AGACACTTCTTCTGGGTTTTAGCTTATTTTAGTAAGAAACATGTAGAAGTTTTTACTGATGGTAAGCTTAATTGTAGTTT +CCAGGACGCTGCCTCTAATATCTTTGTTATTAACAAGGACACTTATGCAGCTCTTAGAAACTCTTTAACTAATGATGCCT +ATTCACGATTTTTGGGGTTGTTTAACAAGTATAAGTACTTCTCTGGTGCTATGGAAACAGCCGCTTATCGTGAAGCTGCA +GCATGTCATCTTGCTAAAGCCTTACAAACATACAGCGAGACTGGTAGTGATCTTCTTTACCAACCACCCAACTGTAGCAT +AACCTCTGGCGTGTTGCAAAGCGGTTTGGTGAAAATGTCACATCCCAGTGGAGATGTTGAGGCTTGTATGGTTCAGGTTA +CCTGCGGTAGCATGACTCTTAATGGTCTTTGGCTTGACAACACAGTCTGGTGCCCACGACACGTAATGTGCCCGGCTGAC +CAGTTGTCTGATCCTAATTATGATGCCTTGTTGATTTCTATGACTAATCATAGTTTCAGTGTGCAAAAACACATTGGCGC +TCCAGCAAACTTGCGTGTTGTTGGTCATGCCATGCAAGGCACTCTTTTGAAGTTGACTGTCGATGTTGCTAACCCTAGCA +CTCCAGCCTACACTTTTACAACAGTGAAACCTGGCGCAGCATTTAGTGTGTTAGCATGCTATAATGGTCGTCCGACTGGT +ACATTCACTGTTGTAATGCGCCCTAACTACACAATTAAGGGTTCCTTTCTGTGTGGTTCTTGTGGTAGTGTTGGTTACAC +CAAGGAGGGTAGTGTGATCAATTTCTGTTACATGCATCAAATGGAACTTGCTAATGGTACACATACCGGTTCAGCATTTG +ATGGTACTATGTATGGTGCCTTTATGGATAAACAAGTGCACCAAGTTCAGTTAACAGACAAATACTGCAGTGTTAATGTA +GTAGCTTGGCTTTACGCAGCAATACTTAATGGTTGCGCTTGGTTTGTAAAACCTAATCGCACTAGTGTTGTTTCTTTTAA +TGAATGGGCTCTTGCCAACCAATTCACTGAATTTGTTGGCACTCAATCCGTTGACATGTTAGCTGTCAAAACAGGCGTTG +CTATTGAACAGCTGCTTTATGCGATCCAACAACTGTATACTGGGTTCCAGGGAAAGCAAATCCTTGGCAGTACCATGTTG +GAAGATGAATTCACACCTGAGGATGTTAATATGCAGATTATGGGTGTGGTTATGCAGAGTGGTGTGAGAAAAGTTACATA +TGGTACTGCGCATTGGTTGTTTGCGACCCTTGTCTCAACCTATGTGATAATCTTACAAGCCACTAAATTTACTTTGTGGA +ACTACTTGTTTGAGACTATTCCCACACAGTTGTTCCCACTCTTATTTGTGACTATGGCCTTCGTTATGTTGTTGGTTAAA +CACAAACACACCTTTTTGACACTTTTCTTGTTGCCTGTGGCTATTTGTTTGACTTATGCAAACATAGTCTACGAGCCCAC +TACTCCCATTTCGTCAGCGCTGATTGCAGTTGCAAATTGGCTTGCCCCCACTAATGCTTATATGCGCACTACACATACTG +ATATTGGTGTCTACATTAGTATGTCACTTGTATTAGTCATTGTAGTGAAGAGATTGTACAACCCATCACTTTCTAACTTT 
+GCGTTAGCATTGTGCAGTGGTGTAATGTGGTTGTACACTTATAGCATTGGAGAAGCCTCAAGCCCCATTGCCTATCTGGT +TTTTGTCACTACACTCACTAGTGATTATACGATTACAGTCTTTGTTACTGTCAACCTTGCAAAAGTTTGCACTTATGCCA +TCTTTGCTTACTCACCACAGCTTACACTTGTGTTTCCGGAAGTGAAGATGATACTTTTATTATACACATGTTTAGGTTTC +ATGTGTACTTGCTATTTTGGTGTCTTCTCTCTTTTGAACCTTAAGCTTAGAGCACCTATGGGTGTCTATGACTTTAAGGT +CTCAACACAAGAGTTCAGATTCATGACTGCTAACAATCTAACTGCACCTAGAAATTCTTGGGAGGCTATGGCTCTGAACT +TTAAGTTAATAGGTATTGGCGGTACACCTTGTATAAAGGTTGCTGCTATGCAGTCTAAACTTACAGATCTTAAATGCACA +TCTGTGGTTCTCCTCTCTGTGCTCCAACAGTTACACTTAGAGGCTAATAGTAGGGCCTGGGCTTTCTGTGTTAAATGCCA +TAATGATATATTGGCAGCAACAGACCCCAGTGAGGCTTTCGAGAAATTCGTAAGTCTCTTTGCTACTTTAATGACTTTTT +CTGGTAATGTAGATCTTGATGCGTTAGCTAGTGATATTTTTGACACTCCTAGCGTACTTCAAGCTACTCTTTCTGAGTTT +TCACACTTAGCTACCTTTGCTGAGTTGGAAGCTGCGCAGAAAGCCTATCAGGAAGCTATGGACTCTGGTGACACCTCACC +ACAAGTTCTTAAGGCTTTGCAGAAGGCTGTTAATATAGCTAAAAACGCCTATGAGAAGGATAAGGCAGTGGCCCGTAAGT +TAGAACGTATGGCTGATCAGGCTATGACTTCTATGTATAAGCAAGCACGTGCTGAAGACAAGAAAGCAAAAATTGTCAGT +GCTATGCAAACTATGTTGTTTGGTATGATTAAGAAGCTCGACAACGATGTTCTTAATGGTATCATTTCTAACGCTAGGAA +TGGTTGTATACCTCTTAGTGTCATCCCACTGTGTGCTTCAAATAAACTTCGCGTTGTAATTCCTGACTTCACCGTCTGGA +ATCAGGTAGTCACATATCCCTCGCTTAACTACGCTGGGGCTTTGTGGGACATTACAGTTATAAACAATGTGGACAATGAA +ATTGTTAAGTCTTCAGATGTTGTAGACAGCAATGAAAATTTAACATGGCCACTTGTTTTAGAATGCACTAGGGCATCCAC +TTCTGCCGTTAAGTTGCAAAATAATGAGATCAAACCTTCAGGTCTAAAAACCATGGTTGTGTCTGCGGGTCAAGAGCAAA +CTAACTGTAATACTAGTTCCTTAGCTTATTACGAACCTGTGCAGGGTCGTAAAATGCTGATGGCTCTTCTTTCTGATAAT +GCCTATCTCAAATGGGCGCGTGTTGAAGGTAAGGACGGATTTGTCAGTGTAGAGCTACAACCTCCTTGCAAATTCTTGAT +TGCGGGACCAAAAGGACCTGAAATCCGATATCTCTATTTTGTTAAAAATCTTAACAACCTTCATCGCGGGCAAGTGTTAG +GGCACATTGCTGCGACTGTTAGATTGCAAGCTGGTTCTAACACCGAGTTTGCCTCTAATTCCTCGGTGTTGTCACTTGTT +AACTTCACCGTTGATCCTCAAAAAGCTTATCTCGATTTCGTCAATGCGGGAGGTGCCCCATTGACAAATTGTGTTAAGAT +GCTTACTCCTAAAACTGGTACAGGTATAGCTATATCTGTTAAACCAGAGAGTACAGCTGATCAAGAGACTTATGGTGGAG +CTTCAGTGTGTCTCTATTGCCGTGCGCATATAGAACATCCTGATGTCTCTGGTGTTTGTAAATATAAGGGTAAGTTTGTC 
+CAAATCCCTGCTCAGTGTGTCCGTGACCCTGTGGGATTTTGTTTGTCAAATACCCCCTGTAATGTCTGTCAATATTGGAT +TGGATATGGGTGCAATTGTGACTCGCTTAGGCAAGCAGCACTGCCCCAATCTAAAGATTCCAATTTTTTAAACGAGTCCG +GGGTTCTATTGTAAATGCCCGAATAGAACCCTGTTCAAGTGGTTTGTCCACTGATGTCGTCTTTAGGGCATTTGACATCT +GCAACTATAAGGCTAAGGTTGCTGGTATTGGAAAATACTACAAGACTAATACTTGTAGGTTTGTAGAATTAGATGACCAA +GGGCATCATTTAGACTCCTATTTTGTCGTTAAGAGGCATACTATGGAGAATTATGAACTAGAGAAGCACTGTTACGACTT +GTTACGTGACTGTGATGCTGTAGCTCCCCATGATTTCTTCATCTTTGATGTAGACAAAGTTAAAACACCTCATATTGTAC +GTCAGCGTTTAACTGAGTACACTATGATGGATCTTGTATATGCCCTGAGGCACTTTGATCAAAATAGCGAAGTGCTTAAG +GCTATCTTAGTGAAGTATGGTTGCTGTGATGTTACCTACTTTGAAAATAAACTCTGGTTTGATTTTGTTGAAAATCCCAG +TGTTATTGGTGTTTATCATAAACTTGGAGAACGTGTACGCCAAGCTATCTTAAACACTGTTAAATTTTGTGACCACATGG +TCAAGGCTGGTTTAGTCGGTGTGCTCACACTAGACAACCAGGACCTTAATGGCAAGTGGTATGATTTTGGTGACTTCGTA +ATCACTCAACCTGGTTCAGGAGTAGCTATAGTTGATAGCTACTATTCTTATTTGATGCCTGTGCTCTCAATGACCGATTG +TCTGGCCGCTGAGACACATAGGGATTGTGATTTTAATAAACCACTCATTGAGTGGCCACTTACTGAGTATGATTTTACTG +ATTATAAGGTACAACTCTTTGAGAAGTACTTTAAATATTGGGATCAGACGTATCACGCAAATTGCGTTAATTGTACTGAT +GACCGTTGTGTGTTACATTGTGCTAATTTCAATGTATTGTTTGCTATGACCATGCCTAAGACTTGTTTCGGACCCATAGT +CCGAAAGATCTTTGTTGATGGCGTGCCATTTGTAGTATCTTGTGGTTATCACTACAAAGAATTAGGTTTAGTCATGAATA +TGGATGTTAGTCTCCATAGACATAGGCTCTCTCTTAAGGAGTTGATGATGTATGCCGCTGATCCAGCCATGCACATTGCC +TCCTCTAACGCTTTTCTTGATTTGAGGACATCATGTTTTAGTGTCGCTGCACTTACAACTGGTTTGACTTTTCAAACTGT +GCGGCCTGGCAATTTTAACCAAGACTTCTATGATTTCGTGGTATCTAAAGGTTTCTTTAAGGAGGGCTCTTCAGTGACGC +TCAAACATTTTTTCTTTGCTCAAGATGGTAATGCTGCTATTACAGATTATAATTACTATTCTTATAATCTGCCTACTATG +TGTGACATCAAACAAATGTTGTTCTGCATGGAAGTTGTAAACAAGTACTTCGAAATCTATGACGGTGGTTGTCTTAATGC +TTCTGAAGTGGTTGTTAATAATTTAGACAAGAGTGCTGGCCATCCTTTTAATAAGTTTGGCAAAGCTCGTGTCTATTATG +AGAGCATGTCTTACCAGGAGCAAGATGAACTTTTTGCCATGACAAAGCGTAACGTCATTCCTACCATGACTCAAATGAAT +CTAAAATATGCTATTAGTGCTAAGAATAGAGCTCGCACTGTTGCAGGCGTGTCCATACTTAGCACAATGACTAATCGCCA +GTACCATCAGAAAATGCTTAAGTCCATGGCTGCAACTCGTGGAGCGACTTGCGTCATTGGTACTACAAAGTTCTACGGTG 
+GCTGGGATTTCATGCTTAAAACATTGTACAAAGATGTTGATAATCCGCATCTTATGGGTTGGGATTACCCTAAGTGTGAT +AGAGCTATGCCTAATATGTGTAGAATCTTCGCTTCACTCATATTAGCTCGTAAACATGGCACTTGTTGTACTACAAGGGA +CAGATTTTATCGCTTGGCAAATGAGTGTGCTCAGGTGCTAAGCGAATATGTTCTATGTGGTGGTGGTTACTACGTCAAAC +CTGGAGGTACCAGTAGCGGAGATGCCACCACTGCATATGCCAATAGTGTCTTTAACATTTTGCAGGCGACAACTGCTAAT +GTCAGTGCACTTATGGGTGCTAATGGCAACAAGATTGTTGACAAAGAAGTTAAAGACATGCAGTTTGATTTGTATGTCAA +TGTTTACAGGAGCACTAGCCCAGACCCCAAATTTGTTGATAAATACTATGCTTTTCTTAATAAGCACTTTTCTATGATGA +TACTGTCTGATGACGGTGTCGTTTGCTATAATAGTGATTATGCAGCTAAGGGTTACATTGCTGGAATACAGAATTTTAAG +GAAACGCTGTATTATCAGAACAATGTCTTTATGTCTGAAGCTAAATGCTGGGTGGAAACCGATCTGAAGAAAGGGCCACA +TGAATTCTGTTCACAGCATACGCTTTATATTAAGGATGGCGACGATGGTTACTTCCTTCCTTATCCAGACCCTTCAAGAA +TTTTGTCTGCCGGTTGCTTTGTAGATGATATCGTTAAGACTGACGGTACACTCATGGTAGAGCGGTTTGTGTCTTTGGCT +ATAGATGCTTACCCTCTCACAAAGCATGAAGATATAGAATACCAGAATGTATTCTGGGTCTACTTACAGTATATAGAAAA +ACTGTATAAAGACCTTACAGGACACATGCTTGACAGTTATTCTGTCATGCTATGTGGTGATAATTCTGCTAAGTTTTGGG +AAGAGGCATTCTATAGAGATCTCTATAGTTCGCCTACCACTTTGCAGGCTGTCGGTTCATGCGTTGTATGCCATTCACAG +ACTTCCCTACGCTGTGGGACATGCATCCGTAGACCATTTCTCTGCTGTAAATGCTGCTATGATCATGTTATAGCAACTCC +ACATAAGATGGTTTTGTCTGTTTCTCCTTACGTTTGTAATGCCCCTGGTTGTGGCGTTTCAGACGTTACTAAGCTATATT +TAGGTGGTATGAGCTACTTTTGTGTAGATCATAGACCTGTGTGTAGTTTTCCACTTTGCGCTAATGGTCTTGTATTCGGC +TTATACAAGAATATGTGCACAGGTAGTCCTTCTATAGTTGAATTTAATAGGTTGGCTACCTGTGACTGGACTGAAAGTGG +TGATTACACCCTTGCCAATACTACAACAGAACCACTCAAACTTTTTGCTGCTGAGACTTTACGTGCCACTGAAGAGGCGT +CTAAGCAGTCTTATGCTATTGCCACCATCAAAGAAATTGTTGGTGAGCGCCAACTATTACTTGTGTGGGAGGCTGGCAAG +TCCAAACCACCACTCAATCGTAATTATGTTTTTACTGGTTATCATATAACCAAAAATAGTAAAGTGCAGCTCGGTGAGTA +CATTTTCGAGCGCATTGATTATAGTGATGCTGTATCCTACAAGTCTAGTACAACGTATAAACTGACTGTAGGTGACATCT +TCGTACTTACCTCTCACTCTGTGGCTACCTTGACGGCGCCCACAATTGTGAATCAAGAGAGGTATGTTAAAATTACTGGG +TTGTACCCAACCATTACGGTACCTGAAGAGTTCGCAAGTCATGTTGCCAACTTCCAAAAATCAGGTTATAGTAAATATGT +CACTGTTCAGGGACCACCTGGCACTGGCAAAAGTCATTTTGCTATAGGGTTAGCGATTTACTACCCTACAGCACGTGTTG 
+TTTATACAGCATGTTCACACGCAGCTGTTGATGCTTTGTGTGAAAAAGCTTTTAAATATTTGAACATTGCTAAATGTTCC +CGTATCATTCCTGCAAAGGCACGTGTTGAGTGCTATGACAGGTTTAAAGTTAATGAGACAAATTCTCAATATTTGTTTAG +TACTATTAATGCTCTACCAGAAACTTCTGCCGATATTCTGGTGGTTGATGAGGTTAGTATGTGCACTAATTATGATCTTT +CAATTATTAATGCACGTATTAAAGCTAAGCACATTGTCTATGTAGGAGATCCAGCACAGTTGCCAGCTCCTAGGACTTTG +TTGACTAGAGGCACATTGGAACCAGAAAATTTCAATAGTGTCACTAGATTGATGTGTAACTTAGGTCCTGACATATTTTT +AAGTATGTGCTACAGGTGTCCTAAGGAAATAGTAAGCACTGTGAGCGCTCTTGTCTACAATAATAAATTGTTAGCCAAGA +AGGAGCTTTCAGGCCAGTGCTTTAAAATACTCTATAAGGGCAATGTGACGCATGATGCTAGCTCTGCCATTAATAGACCA +CAACTCACATTTGTGAAGAATTTTATTACTGCCAATCCGGCATGGAGTAAGGCAGTCTTTATTTCGCCTTACAATTCACA +GAATGCTGTGTCTCGTTCAATGCTGGGTCTTACCACTCAGACTGTTGATTCCTCACAGGGTTCAGAATACCAGTACGTTA +TCTTCTGTCAAACAGCAGATACGGCACATGCTAACAACATTAACAGATTTAATGTTGCAATCACTCGTGCCCAAAAAGGT +ATTCTTTGTGTTATGACATCTCAGGCACTCTTTGAGTCCTTAGAGTTTACTGAATTGTCTTTTACTAATTACAAGCTCCA +GTCTCAGATTGTAACTGGCCTTTTTAAAGATTGCTCTAGAGAAACTTCTGGCCTCTCACCTGCTTATGCACCAACATATG +TTAGTGTTGATGACAAGTATAAGACGAGTGATGAGCTTTGCGTGAATCTTAATTTACCCGCAAATGTCCCATACTCTCGT +GTTATTTCCAGGATGGGCTTTAAACTCGATGCAACAGTTCCTGGATATCCTAAGCTTTTCATTACTCGTGAAGAGGCTGT +AAGGCAAGTTCGAAGCTGGATAGGCTTCGATGTTGAGGGTGCTCATGCTTCCCGTAATGCATGTGGCACCAATGTGCCTC +TACAATTAGGATTTTCAACTGGTGTGAACTTTGTTGTTCAGCCAGTTGGTGTTGTAGACACTGAGTGGGGTAACATGTTA +ACGGGCATTGCTGCACGTCCTCCACCAGGTGAACAGTTTAAGCACCTCGTGCCTCTTATGCATAAGGGGGCTGCGTGGCC +TATTGTTAGACGACGTATAGTGCAAATGTTGTCAGACACTTTAGACAAATTGTCTGATTACTGTACGTTTGTTTGTTGGG +CTCATGGCTTTGAATTAACGTCTGCATCATACTTTTGCAAGATAGGTAAGGAACAGAAGTGTTGCATGTGCAATAGACGC +GCTGCAGCGTACTCTTCACCTCTGCAATCTTATGCCTGCTGGACTCATTCCTGCGGTTATGATTATGTCTACAACCCTTT +CTTTGTCGATGTTCAACAGTGGGGTTATGTAGGCAATCTTGCTACTAATCACGATCGTTATTGCTCTGTCCATCAAGGAG +CTCATGTGGCTTCTAATGATGCAATAATGACTCGTTGTTTAGCTATTCATTCTTGTTTTATAGAACGTGTGGATTGGGAT +ATAGAGTATCCTTATATCTCACATGAAAAGAAATTGAATTCCTGTTGTAGAATCGTTGAGCGCAACGTCGTACGTGCTGC +TCTTCTTGCCGGTTCATTTGACAAAGTCTATGATATTGGCAATCCTAAAGGAATTCCTATTGTTGATGACCCTGTGGTTG 
+ATTGGCATTATTTTGATGCACAGCCCTTGACCAGGAAGGTACAACAGCTTTTCTATACAGAGGACATGGCCTCAAGATTT +GCTGATGGGCTCTGCTTATTTTGGAACTGTAATGTACCAAAATATCCTAATAATGCAATTGTATGCAGGTTTGACACACG +TGTGCATTCTGAGTTCAATTTGCCAGGTTGTGATGGCGGTAGTTTGTATGTTAACAAGCACGCTTTTCATACACCAGCAT +ATGATGTGAGTGCATTCCGTGATCTGAAACCTTTACCATTCTTTTATTATTCTACTACACCATGTGAAGTGCATGGTAAT +GGTAGTATGATAGAGGATATTGATTATGTACCCCTAAAATCTGCAGTCTGTATTACAGCTTGTAATTTAGGGGGCGCTGT +TTGTAGGAAGCATGCTACAGAGTACAGAGAGTATATGGAAGCATATAATCTTGTCTCTGCATCAGGTTTCCGCCTTTGGT +GTTATAAGACCTTTGATATTTATAATCTCTGGTCTACTTTTACAAAAGTTCAAGGTTTGGAAAACATTGCTTTTAATGTT +GTTAAACAAGGCCATTTTATTGGTGTTGAGGGTGAACTACCTGTAGCTGTAGTCAATGATAAGATCTTCACCAAGAGTGG +CGTTAATGACATTTGTATGTTTGAGAATAAAACCACTTTGCCTACTAATATAGCTTTTGAACTCTATGCTAAGCGTGCTG +TACGCTCGCATCCCGATTTCAAATTGCTACACAATTTACAAGCAGACATTTGCTACAAGTTCGTCCTTTGGGATTATGAA +CGTAGCAATATTTATGGTACTGCTACTATTGGTGTATGTAAGTACACTGATATTGATGTTAATTCAGCTTTGAATATATG +TTTTGACATACGCGATAATTGTTCATTGGAGAAGTTCATGTCTACTCCCAATGCCATCTTTATTTCTGATAGAAAAATCA +AGAAATACCCTTGTATGGTAGGTCCTGATTATGCTTACTTCAATGGTGCTATCATCCGTGATAGTGATGTTGTTAAACAA +CCAGTGAAGTTCTACTTGTATAAGAAAGTCAATAATGAGTTTATTGATCCTACTGAGTGTATTTACACTCAGAGTCGCTC +TTGTAGTGACTTCCTACCCCTTTCTGACATGGAGAAAGACTTTCTATCTTTTGATAGTGATGTTTTCATTAAGAAGTATG +GCTTGGAAAACTATGCTTTTGAGCACGTAGTCTATGGAGACTTCTCTCATACTACGTTAGGCGGTCTTCACTTGCTTATT +GGTTTATACAAGAAGCAACAGGAAGGTCATATTATTATGGAAGAAATGCTAAAAGGTAGCTCAACTATTCATAACTATTT +TATTACTGAGACTAACACAGCGGCTTTTAAGGCGGTGTGTTCTGTTATAGATTTAAAGCTTGACGACTTTGTTATGATTT +TAAAGAGTCAAGACCTTGGCGTAGTATCCAAGGTTGTCAAGGTTCCTATTGACTTAACAATGATTGAGTTTATGTTATGG +TGTAAGGATGGACAGGTTCAAACCTTCTACCCTCGACTCCAGGCTTCTGCAGATTGGAAACCTGGTCATGCAATGCCATC +CCTCTTTAAAGTTCAAAATGTAAACCTTGAACGTTGTGAGCTTGCTAATTACAAGCAATCTATTCCTATGCCTCGCGGTG +TGCACATGAACATCGCTAAATATATGCAATTGTGCCAGTATTTAAATACTTGCACATTAGCCGTGCCTGCCAATATGCGT +GTTATACATTTTGGCGCTGGTTCTGATAAAGGTATCGCTCCTGGTACCTCAGTTTTACGACAGTGGCTTCCTACAGATGC +CATTATTATAGATAATGATTTAAATGAGTTCGTGTCAGATGCTGACATAACTTTATTTGGAGATTGTGTAACTGTACGTG 
+TCGGCCAACAAGTGGATCTTGTTATTTCCGACATGTATGATCCTACTACTAAGAATGTAACAGGTAGTAATGAGTCAAAG +GCTTTATTCTTTACTTACCTGTGTAACCTCATTAATAATAATCTTGCTCTTGGTGGGTCTGTTGCTATTAAAATAACAGA +ACACTCTTGGAGCGTTGAACTTTATGAACTTATGGGAAAATTTGCTTGGTGGACTGTTTTCTGCACCAATGCAAATGCAT +CCTCATCTGAAGGATTCCTCTTAGGTATTAATTACTTGGGTACTATTAAAGAAAATATAGATGGTGGTGCTATGCACGCC +AACTATATATTTTGGAGAAATTCCACTCCTATGAATCTGAGTACTTACTCACTTTTTGATTTATCCAAGTTTCAATTAAA +ATTAAAAGGAACACCAGTTCTTCAATTAAAGGAGAGTCAAATTAACGAACTCGTAATATCTCTCCTGTCGCAGGGTAAGT +TACTTATCCGTGACAATGATACACTCAGTGTTTCTACTGATGTTCTTGTTAACACCTACAGAAAGTTACGTTGATGTAGG +GCCAGATTCTGTTAAGTCTGCTTGTATTGAGGTTGATATACAACAGACTTTCTTTGATAAAACTTGGCCTAGGCCAATTG +ATGTTTCTAAGGCTGACGGTATTATATACCCTCAAGGCCGTACATATTCTAACATAACTATCACTTATCAAGGTCTTTTT +CCCTATCAGGGAGACCATGGTGATATGTATGTTTACTCTGCAGGACATGCTACAGGCACAACTCCACAAAAGTTGTTTGT +AGCTAACTATTCTCAGGACGTCAAACAGTTTGCTAATGGGTTTGTCGTCCGTATAGGAGCAGCTGCCAATTCCACTGGCA +CTGTTATTATTAGCCCATCTACCAGCGCTACTATACGAAAAATTTACCCTGCTTTTATGCTGGGTTCTTCAGTTGGTAAT +TTCTCAGATGGTAAAATGGGCCGCTTCTTCAATCATACTCTAGTTCTTTTGCCCGATGGATGTGGCACTTTACTTAGAGC +TTTTTATTGTATTCTAGAGCCTCGCTCTGGAAATCATTGTCCTGCTGGCAATTCCTATACTTCTTTTGCCACTTATCACA +CTCCTGCAACAGATTGTTCTGATGGCAATTACAATCGTAATGCCAGTCTGAACTCTTTTAAGGAGTATTTTAATTTACGT +AACTGCACCTTTATGTACACTTATAACATTACCGAAGATGAGATTTTAGAGTGGTTTGGCATTACACAAACTGCTCAAGG +TGTTCACCTCTTCTCATCTCGGTATGTTGATTTGTACGGCGGCAATATGTTTCAATTTGCCACCTTGCCTGTTTATGATA +CTATTAAGTATTATTCTATCATTCCTCACAGTATTCGTTCTATCCAAAGTGATAGAAAAGCTTGGGCTGCCTTCTACGTA +TATAAACTTCAACCGTTAACTTTCCTGTTGGATTTTTCTGTTGATGGTTATATACGCAGAGCTATAGACTGTGGTTTTAA +TGATTTGTCACAACTCCACTGCTCATATGAATCCTTCGATGTTGAATCTGGAGTTTATTCAGTTTCGTCTTTCGAAGCAA +AACCTTCTGGCTCAGTTGTGGAACAGGCTGAAGGTGTTGAATGTGATTTTTCACCTCTTCTGTCTGGCACACCTCCTCAG +GTTTATAATTTCAAGCGTTTGGTTTTTACCAATTGCAATTATAATCTTACCAAATTGCTTTCACTTTTTTCTGTGAATGA +TTTTACTTGTAGTCAAATATCTCCAGCAGCAATTGCTAGCAACTGTTATTCTTCACTGATTTTGGATTACTTTTCATACC +CACTTAGTATGAAATCCGATCTCAGTGTTAGTTCTGCTGGTCCAATATCCCAGTTTAATTATAAACAGTCCTTTTCTAAT 
+CCCACATGTTTGATTTTAGCGACTGTTCCTCATAACCTTACTACTATTACTAAGCCTCTTAAGTACAGCTATATTAACAA +GTGCTCTCGTCTTCTTTCTGATGATCGTACTGAAGTACCTCAGTTAGTGAACGCTAATCAATACTCACCCTGTGTATCCA +TTGTCCCATCCACTGTGTGGGAAGACGGTGATTATTATAGGAAACAACTATCTCCACTTGAAGGTGGTGGCTGGCTTGTT +GCTAGTGGCTCAACTGTTGCCATGACTGAGCAATTACAGATGGGCTTTGGTATTACAGTTCAATATGGTACAGACACCAA +TAGTGTTTGCCCCAAGCTTGAATTTGCTAATGACACAAAAATTGCCTCTCAATTAGGCAATTGCGTGGAATATTCCCTCT +ATGGTGTTTCGGGCCGTGGTGTTTTTCAGAATTGCACAGCTGTAGGTGTTCGACAGCAGCGCTTTGTTTATGATGCGTAC +CAGAATTTAGTTGGCTATTATTCTGATGATGGCAACTACTACTGTTTGCGTGCTTGTGTTAGTGTTCCTGTTTCTGTCAT +CTATGATAAAGAAACTAAAACCCACGCTACTCTATTTGGTAGTGTTGCATGTGAACACATTTCTTCTACCATGTCTCAAT +ACTCCCGTTCTACGCGATCAATGCTTAAACGGCGAGATTCTACATATGGCCCCCTTCAGACACCTGTTGGTTGTGTCCTA +GGACTTGTTAATTCCTCTTTGTTCGTAGAGGACTGCAAGTTGCCTCTTGGTCAATCTCTCTGTGCTCTTCCTGACACACC +TAGTACTCTCACACCTCGCAGTGTGCGCTCTGTTCCAGGTGAAATGCGCTTGGCATCCATTGCTTTTAATCATCCTATTC +AGGTTGATCAACTTAATAGTAGTTATTTTAAATTAAGTATACCCACTAATTTTTCCTTTGGTGTGACTCAGGAGTACATT +CAGACAACCATTCAGAAAGTTACTGTTGATTGTAAACAGTACGTTTGCAATGGTTTCCAGAAGTGTGAGCAATTACTGCG +CGAGTATGGCCAGTTTTGTTCCAAAATAAACCAGGCTCTCCATGGTGCCAATTTACGCCAGGATGATTCTGTACGTAATT +TGTTTGCGAGCGTGAAAAGCTCTCAATCATCTCCTATCATACCAGGTTTTGGAGGTGACTTTAATTTGACACTTCTAGAA +CCTGTTTCTATATCTACTGGCAGTCGTAGTGCACGTAGTGCTATTGAGGATTTGCTATTTGACAAAGTCACTATAGCTGA +TCCTGGTTATATGCAAGGTTACGATGATTGCATGCAGCAAGGTCCAGCATCAGCTCGTGATCTTATTTGTGCTCAATATG +TGGCTGGTTACAAAGTATTACCTCCTCTTATGGATGTTAATATGGAAGCCGCGTATACTTCATCTTTGCTTGGCAGCATA +GCAGGTGTTGGCTGGACTGCTGGCTTATCCTCCTTTGCTGCTATTCCATTTGCACAGAGTATCTTTTATAGGTTAAACGG +TGTTGGCATTACTCAACAGGTTCTTTCAGAGAACCAAAAGCTTATTGCCAATAAGTTTAATCAGGCTCTGGGAGCTATGC +AAACAGGCTTCACTACAACTAATGAAGCTTTTCAGAAGGTTCAGGATGCTGTGAACAACAATGCACAGGCTCTATCCAAA +TTAGCTAGCGAGCTATCTAATACTTTTGGTGCTATTTCCGCCTCTATTGGAGACATCATACAACGTCTTGATGTTCTCGA +ACAGGACGCCCAAATAGACAGACTTATTAATGGCCGTTTGACAACACTAAATGCTTTTGTTGCACAGCAGCTTGTTCGTT +CCGAATCAGCTGCTCTTTCCGCTCAATTGGCTAAAGATAAAGTCAATGAGTGTGTCAAGGCACAATCCAAGCGTTCTGGA 
+TTTTGCGGTCAAGGCACACATATAGTGTCCTTTGTTGTAAATGCCCCTAATGGCCTTTACTTCATGCATGTTGGTTATTA +CCCTAGCAACCACATTGAGGTTGTTTCTGCTTATGGTCTTTGCGATGCAGCTAACCCTACTAATTGTATAGCCCCTGTTA +ATGGCTACTTTATTAAAACTAATAACACTAGGATTGTTGATGAGTGGTCATATACTGGCTCGTCCTTCTATGCACCTGAG +CCCATTACCTCCCTTAATACTAAGTATGTTGCACCACAGGTGACATACCAAAACATTTCTACTAACCTCCCTCCTCCTCT +TCTCGGCAATTCCACCGGGATTGACTTCCAAGATGAGTTGGATGAGTTTTTCAAAAATGTTAGCACCAGTATACCTAATT +TTGGTTCCCTAACACAGATTAATACTACATTACTCGATCTTACCTACGAGATGTTGTCTCTTCAACAAGTTGTTAAAGCC +CTTAATGAGTCTTACATAGACCTTAAAGAGCTTGGCAATTATACTTATTACAACAAATGGCCGTGGTACATTTGGCTTGG +TTTCATTGCTGGGCTTGTTGCCTTAGCTCTATGCGTCTTCTTCATACTGTGCTGCACTGGTTGTGGCACAAACTGTATGG +GAAAACTTAAGTGTAATCGTTGTTGTGATAGATACGAGGAATACGACCTCGAGCCGCATAAGGTTCATGTTCACTAATTA +ACGAACTATTAATGAGAGTTCAAAGACCACCCACTCTCTTGTTAGTGTTTTCACTCTCTCTTTTGGTCACTGCATCCTCA +AAACCTCTCTATGTACCTGAGCATTGTCAGAATTATTCTGGTTGCATGCTTAGGGCTTGTATTAAAACTGCCCAAGCTGA +TACAGCTGGTCTTTATACAAATTTTCGAATTGACGTCCCATCTGCAGAATCAACTGGTACTCAATCAGTTTCTGTCGATC +TTGAGTCAACTTCAACTCATGATGGTCCTACCGAACATGTTACTAGTGTGAATCTTTTTGACGTTGGTTACTCAGTTAAT +TAACGAACTCTATGGATTACGTGTCTCTGCTTAATCAAATTTGGCAGAAGTACCTTAACTCACCGTATACTACTTGTTTG +TACATCCCTAAACCCACAGCTAAGTATACACCTTTAGTTGGCACTTCATTGCACCCTGTGCTGTGGAACTGTCAGCTATC +CTTTGCTGGTTATACTGAATCTGCTGTTAATTCTACAAAAGCTTTGGCCAAACAGGACGCAGCTCAGCGAATCGCTTGGT +TGCTACATAAGGATGGAGGAATCCCTGATGGATGTTCCCTCTACCTCCGGCACTCAAGTTTATTCGCGCAAAGCGAGGAA +GAGGAGCCATTCTCCAACTAAGAAACTGCGCTACGTTAAGCGTAGATTTTCTCTTCTGCGCCATGAAGACCTTAGTGTTA +TTGTCCAACCAACACACTATGTCAGGGTTACATTTTCAGACCCCAACATGTGGTATCTACGTTCGGGTCATCATTTACAC +TCAGTTCACAATTGGCTTAAACCTTATGGCGGCCAACCTGTTTCTGAGTACCATATTACTCTAGCTTTGCTAAATCTCAC +TGATGAAGATTTAGCTAGAGATTTTTCACCCATTGCGCTCTTTTTGCGCAATGTCAGATTTGAGCTACATGAGTTCGCCT +TGCTGCGCAAAACTCTTGTTCTTAATGCATCAGAGATCTACTGTGCTAACATACATAGATTTAAGCCTGTGTATAGAGTT +AACACGGCAATCCCTACTATTAAGGATTGGCTTCTCGTTCAGGGATTTTCCCTTTACCATAGTGGCCTCCCTTTACATAT +GTCAATCTCTAAATTGCATGCACTGGATGATGTTACTCGCAATTACATCATTACAATGCCATGCTTTAGAACTTACCCTC 
+AACAAATGTTTGTTACTCCTTTGGCCGTAGATGTTGTCTCCATACGGTCTTCCAATCAGGGTAATAAACAAATTGTTCAT +TCTTATCCCATTTTACATCATCCAGGATTTTAACGAACTATGGCTTTCTCGGCGTCTTTATTTAAACCCGTCCAGCTAGT +CCCAGTTTCTCCTGCATTTCATCGCATTGAGTCTACTGACTCTATTGTTTTCACATACATTCCTGCTAGCGGCTATGTAG +CTGCTTTAGCTGTCAATGTGTGTCTCATTCCCCTATTATTACTGCTACGTCAAGATACTTGTCGTCGCAGCATTATCAGA +ACTATGGTTCTCTATTTCCTTGTTCTGTATAACTTTTTATTAGCCATTGTACTAGTCAATGGTGTACATTATCCAACTGG +AAGTTGCCTGATAGCCTTCTTAGTTATCCTCATAATACTTTGGTTTGTAGATAGAATTCGTTTCTGTCTCATGCTGAATT +CCTACATTCCACTGTTTGACATGCGTTCCCACTTTATTCGTGTTAGTACAGTTTCTTCTCATGGTATGGTCCCTGTAATA +CACACCAAACCATTATTTATTAGAAACTTCGATCAGCGTTGCAGCTGTTCTCGTTGTTTTTATTTGCACTCTTCCACTTA +TATAGAGTGCACTTATATTAGCCGTTTTAGTAAGATTAGCCTAGTTTCTGTAACTGACTTCTCCTTAAACGGCAATGTTT +CCACTGTTTTCGTGCCTGCAACGCGCGATTCAGTTCCTCTTCACATAATCGCCCCGAGCTCGCTTATCGTTTAAGCAGCT +CTGCGCTACTATGGGTCCCGTGTAGAGGCTAATCCATTAGTCTCTCTTTGGACATATGGAAAACGAACTATGTTACCCTT +TGTCCAAGAACGAATAGGGTTGTTCATAGTAAACTTTTTCATTTTTACCGTAGTATGTGCTATAACACTCTTGGTGTGTA +TGGCTTTCCTTACGGCTACTAGATTATGTGTGCAATGTATGACAGGCTTCAATACCCTGTTAGTTCAGCCCGCATTATAC +TTGTATAATACTGGACGTTCAGTCTATGTAAAATTCCAGGATAGTAAACCCCCTCTACCACCTGACGAGTGGGTTTAACG +AACTCCTTCATAATGTCTAATATGACGCAACTCACTGAGGCGCAGATTATTGCCATTATTAAAGACTGGAACTTTGCATG +GTCCCTGATCTTTCTCTTAATTACTATCGTACTACAGTATGGATACCCATCCCGTAGTATGACTGTCTATGTCTTTAAAA +TGTTTGTTTTATGGCTCCTATGGCCATCTTCCATGGCGCTATCAATATTTAGCGCCGTTTATCCAATTGATCTAGCTTCC +CAGATAATCTCTGGCATTGTAGCAGCTGTTTCAGCTATGATGTGGATTTCCTACTTTGTGCAGAGTATCCGGCTGTTTAT +GAGAACTGGATCATGGTGGTCATTCAATCCTGAGACTAATTGCCTTTTGAACGTTCCATTTGGTGGTACAACTGTCGTAC +GTCCACTCGTAGAGGACTCTACCAGTGTAACTGCTGTTGTAACCAATGGCCACCTCAAAATGGCTGGCATGCATTTCGGT +GCTTGTGACTACGACAGACTTCCTAATGAAGTCACCGTGGCCAAACCCAATGTGCTGATTGCTTTAAAAATGGTGAAGCG +GCAAAGCTACGGAACTAATTCCGGCGTTGCCATTTACCATAGATATAAGGCAGGTAATTACAGGAGTCCGCCTATTACGG +CGGATATTGAACTTGCATTGCTTCGAGCTTAGGCTCTTTAGTAAGAGTATCTTAATTGATTTTAACGAATCTCAATTTCA +TTGTTATGGCATCCCCTGCTGCACCTCGTGCTGTTTCCTTTGCCGATAACAATGATATAACAAATACAAACCTATCTCGA 
+GGTAGAGGACGTAATCCAAAACCACGAGCTGCACCAAATAACACTGTCTCTTGGTACACTGGGCTTACCCAACACGGGAA +AGTCCCTCTTACCTTTCCACCTGGGCAGGGTGTACCTCTTAATGCCAATTCTACCCCTGCGCAAAATGCTGGGTATTGGC +GGAGACAGGACAGAAAAATTAATACCGGGAATGGAATTAAGCAACTGGCTCCCAGGTGGTACTTCTACTACACTGGAACT +GGACCCGAAGCAGCACTCCCATTCCGGGCTGTTAAGGATGGCATCGTTTGGGTCCATGAAGATGGCGCCACTGATGCTCC +TTCAACTTTTGGGACGCGGAACCCTAACAATGATTCAGCTATTGTTACACAATTCGCGCCCGGTACTAAGCTTCCTAAAA +ACTTCCACATTGAGGGGACTGGAGGCAATAGTCAATCATCTTCAAGAGCCTCTAGCTTAAGCAGAAACTCTTCCAGATCT +AGTTCACAAGGTTCAAGATCAGGAAACTCTACCCGCGGCACTTCTCCAGGTCCATCTGGAATCGGAGCAGTAGGAGGTGA +TCTACTTTACCTTGATCTTCTGAACAGACTACAAGCCCTTGAGTCTGGCAAAGTAAAGCAATCGCAGCCAAAAGTAATCA +CTAAGAAAGATGCTGCTGCTGCTAAAAATAAGATGCGCCACAAGCGCACTTCCACCAAAAGTTTCAACATGGTGCAAGCT +TTTGGTCTTCGCGGACCAGGAGACCTCCAGGGAAACTTTGGTGATCTTCAATTGAATAAACTCGGCACTGAGGACCCACG +TTGGCCCCAAATTGCTGAGCTTGCTCCTACAGCCAGTGCTTTTATGGGTATGTCGCAATTTAAACTTACCCATCAGAACA +ATGATGATCATGGCAACCCTGTGTACTTCCTTCGGTACAGTGGAGCCATTAAACTTGACCCAAAGAATCCCAACTACAAT +AAGTGGTTGGAGCTTCTTGAGCAAAATATTGATGCCTACAAAACCTTCCCTAAGAAGGAAAAGAAACAAAAGGCACCAAA +AGAAGAATCAACAGACCAAATGTCTGAACCTCCAAAGGAGCAGCGTGTGCAAGGTAGCATCACTCAGCGCACTCGCACCC +GTCCAAGTGTTCAGCCTGGTCCAATGATTGATGTTAACACTGATTAGTGTCACTCAAAGTAACAAGATCGCGGCAATCGT +TTGTGTTTGGCAACCCCATCTCACCATCGCTTGTCCACTCTTGCACAGAATGGAATCATGTTGTAATTACAGTGCAATAA +GGTAATTATAACCCATTTAATTGATAGCTATGCTTTATTAAAGTGTGTAGCTGTAGAGAGAATGTTAAAGACTGTCACCT +CTGCTTGATTGCAAGTGAACAGTGCCCCCCGGGAAGAGCTCTACAGTGTGAAATGTAAATAAAAAATAGCTATTATTCAA +TTAGATTAGGCTAATTAGATGATTTGCAAAAAAAAAAAA diff --git a/data/library/viral/assembly_summary_refseq.txt b/data/library/viral/assembly_summary_refseq.txt new file mode 100644 index 0000000..f1fdaef --- /dev/null +++ b/data/library/viral/assembly_summary_refseq.txt @@ -0,0 +1,3 @@ +GCF_000864765.1 PRJNA485481 na na reference genome 11676 11676 Human immunodeficiency virus 1 na na latest Complete Genome Major Full 2000/08/01 ViralProj15476 NIH, NLM GCA_000864765.1 identical 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/864/765/GCF_000864765.1_ViralProj15476 na ICTV species exemplar na haploid viral 9181 9181 42.000000 1 1 1NCBI RefSeq Annotation submitted by NCBI RefSeq 2018/08/13 10 10 0 9362478 +GCF_000865725.1 PRJNA485481 na na na 211044 2955291 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) strain=A/Puerto Rico/8/1934 na latest Complete Genome Major Full 2000/06/12 ViralMultiSegProj15521 NCBI GCA_000865725.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/865/725/GCF_000865725.1_ViralMultiSegProj15521 na ICTV species exemplar na haploid viral 1358813588 43.500000 8 8 8 NCBI RefSeq Annotation submitted by NCBI RefSeq 2018/08/13 12 12 0 7010182;7060132;7278968;6281731;7208353;6927841;7465426;7292985 +GCF_009858895.2 PRJNA485481 na na reference genome 2697049 694009 Severe acute respiratory syndrome coronavirus 2 na Wuhan-Hu-1 latest Complete Genome Major Full 2020/01/17 ASM985889v3 Shanghai Public Health Clinical Center & School of Public Health, Fudan University GCA_009858895.3 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3 na ICTV additional isolate na haploid viral 29903 29903 38.000000 1 1 1 NCBI RefSeq Annotation submitted by NCBI RefSeq 2020/07/18 11 11 0 32015508 diff --git a/data/library/viral/refseq/GCF_000864765.1_ViralProj15476_genomic.fna.gz b/data/library/viral/refseq/GCF_000864765.1_ViralProj15476_genomic.fna.gz new file mode 100644 index 0000000..9b21b44 Binary files /dev/null and b/data/library/viral/refseq/GCF_000864765.1_ViralProj15476_genomic.fna.gz differ diff --git a/data/library/viral/refseq/GCF_000865725.1_ViralMultiSegProj15521_genomic.fna.gz b/data/library/viral/refseq/GCF_000865725.1_ViralMultiSegProj15521_genomic.fna.gz new file mode 100644 index 0000000..5f60e46 Binary files /dev/null and b/data/library/viral/refseq/GCF_000865725.1_ViralMultiSegProj15521_genomic.fna.gz differ diff --git 
a/data/library/viral/refseq/GCF_009858895.2_ASM985889v3_genomic.fna.gz b/data/library/viral/refseq/GCF_009858895.2_ASM985889v3_genomic.fna.gz new file mode 100644 index 0000000..73a8943 Binary files /dev/null and b/data/library/viral/refseq/GCF_009858895.2_ASM985889v3_genomic.fna.gz differ diff --git a/data/taxonomy/names.dmp b/data/taxonomy/names.dmp new file mode 100644 index 0000000..285998a --- /dev/null +++ b/data/taxonomy/names.dmp @@ -0,0 +1,39 @@ +1 | root | | scientific name | +10239 | Viruses | | scientific name | +10699 | Siphoviridae | | scientific name | +10710 | Escherichia virus Lambda | | scientific name | +11118 | Coronaviridae | | scientific name | +11308 | Orthomyxoviridae | | scientific name | +11320 | Influenza A virus | | scientific name | +11520 | Influenza B virus | | scientific name | +11632 | Retroviridae | | scientific name | +11646 | Lentivirus | | scientific name | +11676 | Human immunodeficiency virus 1 | | scientific name | +11709 | Human immunodeficiency virus 2 | | scientific name | +28883 | Caudovirales | | scientific name | +76804 | Nidovirales | | scientific name | +114727 | H1N1 subtype | | scientific name | +114729 | H2N2 subtype | | scientific name | +119210 | H3N2 subtype | | scientific name | +186765 | Lambdavirus | | scientific name | +197911 | Alphainfluenzavirus | | scientific name | +197912 | Betainfluenzavirus | | scientific name | +211044 | Influenza A virus (A/Puerto Rico/8/1934(H1N1)) | | scientific name | +327045 | Orthoretrovirinae | | scientific name | +335341 | Influenza A virus (A/New York/392/2004(H3N2)) | | scientific name | +488241 | Influenza A virus (A/Korea/426/1968(H2N2)) | | scientific name | +518987 | Influenza B virus (B/Lee/1940) | | scientific name | +694002 | Betacoronavirus | | scientific name | +694009 | Severe acute respiratory syndrome-related coronavirus | | scientific name | +1335626 | Middle East respiratory syndrome-related coronavirus | | scientific name | +2169561 | Ortervirales | | 
scientific name | +2497569 | Negarnaviricota | | scientific name | +2497571 | Polyploviricotina | | scientific name | +2497577 | Insthoviricetes | | scientific name | +2499399 | Cornidovirineae | | scientific name | +2499411 | Articulavirales | | scientific name | +2501931 | Orthocoronavirinae | | scientific name | +2509494 | Merbecovirus | | scientific name | +2509511 | Sarbecovirus | | scientific name | +2559587 | Riboviria | | scientific name | +2697049 | Severe acute respiratory syndrome coronavirus 2 | | scientific name | diff --git a/data/taxonomy/nodes.dmp b/data/taxonomy/nodes.dmp new file mode 100644 index 0000000..aff7540 --- /dev/null +++ b/data/taxonomy/nodes.dmp @@ -0,0 +1,39 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +10239 | 1 | superkingdom | | 9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +10699 | 28883 | family | | 3 | 0 | 11 | 1 | 0 | 1 | 0 | 0 | | +10710 | 186765 | species | EV | 3 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +11118 | 2499399 | family | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +11308 | 2499411 | family | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +11320 | 197911 | species | IA | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +11520 | 197912 | species | IB | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +11632 | 2169561 | family | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +11646 | 327045 | genus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +11676 | 11646 | species | HI | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +11709 | 11646 | species | HI | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +28883 | 10239 | order | | 3 | 0 | 11 | 0 | 0 | 1 | 0 | 0 | | +76804 | 2559587 | order | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +114727 | 11320 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +114729 | 11320 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +119210 | 11320 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +186765 | 10699 | genus | | 3 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +197911 | 11308 | genus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +197912 | 11308 | genus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +211044 | 114727 | no 
rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +327045 | 11632 | subfamily | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +335341 | 119210 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +488241 | 114729 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +518987 | 11520 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +694002 | 2501931 | genus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +694009 | 2509511 | species | SA | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +1335626 | 2509494 | species | ME | 9 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +2169561 | 10239 | order | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2497569 | 2559587 | phylum | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2497571 | 2497569 | subphylum | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2497577 | 2497571 | class | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2499399 | 76804 | suborder | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2499411 | 2497577 | order | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2501931 | 11118 | subfamily | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2509494 | 694002 | subgenus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2509511 | 694002 | subgenus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2559587 | 10239 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | +2697049 | 694009 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | | diff --git a/kr2r.sh b/kr2r.sh index 35b2ad8..bda6b88 100644 --- a/kr2r.sh +++ b/kr2r.sh @@ -1,24 +1,18 @@ - - DIR=`dirname $(realpath $0 || echo $0)` +DOWNLOADS="" DATABASE="" -DATABASE_CHUNK="" +CHUNK_DIR="" -# 1. 下载 bacteria,viral 原始fna.gz格式文件和md5文件 -${DIR}/ncbi --db $DATABASE gen -g bacteria,viral -# 1.1 校验md5文件 -${DIR}/ncbi --db $DATABASE gen -g bacteria,viral md5 - -# 1.2 下载taxonomy文件 -${DIR}/ncbi --db $DATABASE taxonomy +# 1. 下载 bacteria,viral 原始fna.gz格式文件和md5文件 +${DIR}/ncbi -d $DOWNLOADS gen -g bacteria,viral -# 2. 生成library.fna和prelim_map.txt子文件 -${DIR}/ncbi --db $DATABASE gen -g bacteria,viral fna +# 2. 下载taxonomy文件 +${DIR}/ncbi -d $DATABASE taxonomy -# 3. 预估数据库大小 -# ${DIR}/Kun estimate_capacity --db $DATABASE -k 35 -l 31 +# 3. 
build +${DIR}/kun_peng build -d $DATABASE --db $DATABASE -# 4. build -${DIR}/kun_peng build --db $DATABASE --chunk-dir ${DATABASE_CHUNK} +# 4. classify +./target/release/kun_peng classify --db $DATABASE --chunk-dir $CHUNK_DIR $the_sample_files diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml index 3d958ac..5826ace 100644 --- a/kr2r/Cargo.toml +++ b/kr2r/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kr2r" -version = "0.5.0" +version = "0.5.1" edition = "2021" authors = ["eric9n@gmail.com"] @@ -11,15 +11,12 @@ name = "kun_peng" path = "src/bin/kun.rs" [features] -default = ["dna"] -dna = [] -protein = [] double_hashing = [] exact_counting = [] [dependencies] +seqkmer = { version = "0.1.0", path = "../seqkmer" } clap = { version = "4.4.10", features = ["derive"] } -seq_io = "0.3.2" hyperloglogplus = { version = "*", features = ["const-loop"] } seahash = "4.1.0" serde = { version = "1.0", features = ["derive"] } @@ -31,17 +28,10 @@ libc = "0.2" regex = "1.5.4" flate2 = "1.0" dashmap = { version = "5.5.3", features = ["rayon"] } +num_cpus = "1.13.1" [dev-dependencies] criterion = "0.5.1" twox-hash = "1.6.3" farmhash = {version = "1.1.5"} -[[bench]] -name = "mmscanner_benchmark" -harness = false - - -[[bench]] -name = "hash_benchmark" -harness = false diff --git a/kr2r/benches/hash_benchmark.rs b/kr2r/benches/hash_benchmark.rs deleted file mode 100644 index cba911c..0000000 --- a/kr2r/benches/hash_benchmark.rs +++ /dev/null @@ -1,32 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use kr2r::{fmix64, murmur_hash3, sea_hash}; -use std::hash::Hasher; -use twox_hash::xxh3; -extern crate farmhash; - -#[inline] -pub fn xx_hash(key: u64) -> u64 { - let mut xhash = xxh3::Hash64::default(); - xhash.write_u64(key); - xhash.finish() -} - -#[inline] -pub fn farm(key: u64) -> u64 { - // let bytes = key.to_be_bytes(); - // let byte_slce: &[u8] = &bytes; - farmhash::hash64(&key.to_be_bytes()) -} - -fn criterion_benchmark(c: &mut Criterion) { - let key = 
0x12345678abcdef01u64; - - c.bench_function("fmix64", |b| b.iter(|| fmix64(black_box(key)))); - c.bench_function("murmur_hash3", |b| b.iter(|| murmur_hash3(black_box(key)))); - c.bench_function("sea_hash", |b| b.iter(|| sea_hash(black_box(key)))); - c.bench_function("xx_hash", |b| b.iter(|| xx_hash(black_box(key)))); - c.bench_function("farm", |b| b.iter(|| farm(black_box(key)))); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/kr2r/benches/mmscanner_benchmark.rs b/kr2r/benches/mmscanner_benchmark.rs deleted file mode 100644 index d728c15..0000000 --- a/kr2r/benches/mmscanner_benchmark.rs +++ /dev/null @@ -1,20 +0,0 @@ -use criterion::{criterion_group, criterion_main, Criterion}; -use kr2r::mmscanner::MinimizerScanner; -use kr2r::Meros; -// 定义性能测试函数 -fn performance_test(c: &mut Criterion) { - let seq: Vec = b"ACGATCGACGACG".to_vec(); - let meros = Meros::new(10, 5, None, None, None); - let mut scanner = MinimizerScanner::new(&seq, meros); - // 这里执行需要测试性能的操作,例如多次调用 next_minimizer - c.bench_function("next", |b| { - b.iter(|| { - let _ = scanner.next(); - // let _ = scanner.next_minimizer(&seq); - }); - }); -} - -// 创建性能测试组 -criterion_group!(benches, performance_test); -criterion_main!(benches); diff --git a/kr2r/examples/build_and_classify.rs b/kr2r/examples/build_and_classify.rs new file mode 100644 index 0000000..b3cbd13 --- /dev/null +++ b/kr2r/examples/build_and_classify.rs @@ -0,0 +1,82 @@ +use std::fs; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + // Define the paths and directories + let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .to_path_buf(); + let kr2r_binary = workspace_root.join("target/release/kun_peng"); + let data_dir = workspace_root.join("data"); + let test_dir = workspace_root.join("test_database"); + + // Ensure the necessary directories exist + fs::create_dir_all(&data_dir).expect("Failed to create download directory"); + 
fs::create_dir_all(&test_dir).expect("Failed to create database directory"); + + // Command 1: ./target/release/kun_peng build --download-dir data/ --db test_database + let build_args = vec![ + "build".to_string(), + "--download-dir".to_string(), + data_dir.to_string_lossy().to_string(), + "--db".to_string(), + test_dir.to_string_lossy().to_string(), + ]; + + let build_command_str = format!("{} {}", kr2r_binary.to_string_lossy(), build_args.join(" ")); + println!("Executing command: {}", build_command_str); + + let build_output = Command::new(&kr2r_binary) + .args(&build_args) + .output() + .expect("Failed to run kun_peng build command"); + println!( + "kun_peng build output: {}", + String::from_utf8_lossy(&build_output.stdout) + ); + if !build_output.stderr.is_empty() { + println!( + "kun_peng build error: {}", + String::from_utf8_lossy(&build_output.stderr) + ); + } + + // Command 2: ./target/release/kun_peng direct --db test_database data/COVID_19.fa + let covid_fa = data_dir.join("COVID_19.fa"); + if !covid_fa.exists() { + println!( + "kun_peng error: fasta file {} does not exists", + covid_fa.to_string_lossy().to_string() + ); + } + let direct_args = vec![ + "direct".to_string(), + "--db".to_string(), + test_dir.to_string_lossy().to_string(), + covid_fa.to_string_lossy().to_string(), + ]; + + let direct_command_str = format!( + "{} {}", + kr2r_binary.to_string_lossy(), + direct_args.join(" ") + ); + println!("Executing command: {}", direct_command_str); + + let direct_output = Command::new(&kr2r_binary) + .args(&direct_args) + .output() + .expect("Failed to run kun_peng direct command"); + println!( + "kun_peng direct output: {}", + String::from_utf8_lossy(&direct_output.stdout) + ); + if !direct_output.stderr.is_empty() { + println!( + "kun_peng direct error: {}", + String::from_utf8_lossy(&direct_output.stderr) + ); + } +} diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs index 43ebe18..2a55865 100644 --- a/kr2r/src/args.rs +++ b/kr2r/src/args.rs @@ -1,10 
+1,12 @@ // 使用时需要引用模块路径 use crate::utils::expand_spaced_seed_mask; -use crate::{construct_seed_template, parse_binary, Meros, BITS_PER_CHAR}; -use crate::{ - DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES, DEFAULT_TOGGLE_MASK, -}; +use crate::{construct_seed_template, parse_binary}; use clap::Parser; +use seqkmer::Meros; +use seqkmer::{ + BITS_PER_CHAR, DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES, + DEFAULT_TOGGLE_MASK, +}; use std::path::PathBuf; pub const U32MAXPLUS: u64 = u32::MAX as u64; @@ -29,30 +31,11 @@ pub struct Build { pub requested_bits_for_taxid: u8, /// Number of threads - #[clap(short = 'p', long, default_value_t = 10)] + #[clap(short = 'p', long, default_value_t = num_cpus::get())] pub threads: usize, } -#[derive(Parser, Debug, Clone)] -#[clap(version, about = "taxonomy")] -pub struct Taxo { - // /// Kraken 2 taxonomy filename, default = $database/taxo.k2d - // #[clap(short = 't')] - // pub taxonomy_filename: Option, - - // #[clap(short = 'm', required = true)] - // pub id_to_taxon_map_filename: PathBuf, - /// Sequence ID to taxon map filename - /// seqid2taxid.map file path, default = $database/seqid2taxid.map - #[arg(short = 'm')] - pub id_to_taxon_map_filename: Option, - - /// NCBI taxonomy directory name, default = $database/taxonomy - #[clap(short, long)] - pub ncbi_taxonomy_directory: Option, -} - -const BATCH_SIZE: usize = 8 * 1024 * 1024; +const BATCH_SIZE: usize = 16 * 1024 * 1024; /// Command line arguments for the classify program. 
/// @@ -68,18 +51,14 @@ const BATCH_SIZE: usize = 8 * 1024 * 1024; long_about = "classify a set of sequences" )] pub struct ClassifyArgs { - /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + // /// database hash chunk directory and other files + #[arg(long = "db", required = true)] + pub database: PathBuf, /// chunk directory #[clap(long)] pub chunk_dir: PathBuf, - /// Enables use of a Kraken 2 compatible shared database. Default is false. - #[clap(long, default_value_t = false)] - pub kraken_db_type: bool, - /// File path for outputting normal Kraken output. #[clap(long = "output-dir", value_parser)] pub kraken_output_dir: Option, @@ -92,7 +71,7 @@ pub struct ClassifyArgs { #[clap(short = 'S', long = "single-file-pairs", action)] pub single_file_pairs: bool, - /// Minimum quality score for FASTQ data, default is 0. + /// Minimum quality score for FASTQ data. #[clap( short = 'Q', long = "minimum-quality-score", @@ -101,15 +80,15 @@ pub struct ClassifyArgs { )] pub minimum_quality_score: i32, - /// The number of threads to use, default is 10. - #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = 10)] - pub num_threads: i32, + /// The number of threads to use. + #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = num_cpus::get())] + pub num_threads: usize, - /// 批量处理大小 default: 8MB + /// 批量处理大小 default: 16MB #[clap(long, default_value_t = BATCH_SIZE)] pub batch_size: usize, - /// Confidence score threshold, default is 0.0. + /// Confidence score threshold #[clap( short = 'T', long = "confidence-threshold", @@ -127,6 +106,10 @@ pub struct ClassifyArgs { )] pub minimum_hit_groups: usize, + /// Enables use of a Kraken 2 compatible shared database. + #[clap(long, default_value_t = false)] + pub kraken_db_type: bool, + /// In comb. 
w/ -R, provide minimizer information in report #[clap(short = 'K', long, value_parser, default_value_t = false)] pub report_kmer_data: bool, diff --git a/kr2r/src/bin/annotate.rs b/kr2r/src/bin/annotate.rs index dd26710..5fe5bb4 100644 --- a/kr2r/src/bin/annotate.rs +++ b/kr2r/src/bin/annotate.rs @@ -25,8 +25,8 @@ pub const BATCH_SIZE: usize = 8 * 1024 * 1024; )] pub struct Args { /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + #[arg(long = "db", required = true)] + pub database: PathBuf, /// Enables use of a Kraken 2 compatible shared database. Default is false. #[clap(long, default_value_t = false)] @@ -128,7 +128,8 @@ where .into_par_iter() .filter_map(|slot| { let indx = slot.idx & idx_mask; - let taxid = chtm.get_from_page(indx, slot.value, page_index); + let compacted = slot.value.left(value_bits) as u32; + let taxid = chtm.get_from_page(indx, compacted, page_index); if taxid > 0 { let kmer_id = slot.idx >> idx_bits; @@ -199,7 +200,7 @@ fn process_chunk_file>( let start = Instant::now(); - let config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; let parition = hash_files.len(); let chtm = if args.kraken_db_type { CHTable::from_pair( @@ -230,11 +231,11 @@ fn process_chunk_file>( pub fn run(args: Args) -> Result<()> { let chunk_files = find_and_sort_files(&args.chunk_dir, "sample", ".k2")?; - let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; + let hash_files = find_and_sort_files(&args.database, "hash", ".k2d")?; // 开始计时 let start = Instant::now(); - println!("start..."); + println!("annotate start..."); for chunk_file in chunk_files { println!("chunk_file {:?}", chunk_file); process_chunk_file(&args, chunk_file, &hash_files)?; diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs index 1eb1c17..f231bd5 100644 --- a/kr2r/src/bin/build_k2_db.rs +++ 
b/kr2r/src/bin/build_k2_db.rs @@ -1,62 +1,51 @@ // 使用时需要引用模块路径 use clap::Parser; -use kr2r::args::{parse_size, Build, Taxo}; +use kr2r::args::{parse_size, Build}; use kr2r::compact_hash::HashConfig; use kr2r::db::{ convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file, write_config_to_file, }; use kr2r::utils::{ - create_partition_files, create_partition_writers, find_library_fna_files, format_bytes, - get_file_limit, read_id_to_taxon_map, + create_partition_files, create_partition_writers, find_library_fna_files, get_file_limit, + read_id_to_taxon_map, }; use kr2r::IndexOptions; use std::fs::remove_file; -use std::path::PathBuf; use std::time::Instant; #[derive(Parser, Debug, Clone)] #[clap(author, version, about="build database", long_about = None)] pub struct Args { - /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: Option, - + // /// database hash chunk directory and other files + // #[clap(long)] + // pub k2d_dir: Option, #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")] pub hash_capacity: usize, - /// chunk temp directory - #[clap(long)] - pub chunk_dir: PathBuf, - + // chunk temp directory + // #[clap(long)] + // pub chunk_dir: PathBuf, /// 包含原始配置 #[clap(flatten)] pub build: Build, - - #[clap(flatten)] - pub taxo: Taxo, + // #[arg(short = 'm')] + // pub id_to_taxon_map_filename: Option, } pub fn run(args: Args, required_capacity: usize) -> Result<(), Box> { let file_num_limit = get_file_limit(); let meros = args.build.klmt.as_meros(); - let id_to_taxon_map_filename = args - .taxo - .id_to_taxon_map_filename - 
.unwrap_or(args.build.database.join("seqid2taxid.map")); + let id_to_taxon_map_filename = args.build.database.join("seqid2taxid.map"); let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?; - let source: PathBuf = args.build.database.clone(); - let k2d_dir = args.k2d_dir.unwrap_or(source.clone()); + let k2d_dir = &args.build.database; let taxonomy_filename = k2d_dir.join("taxo.k2d"); - let ncbi_taxonomy_directory = args - .taxo - .ncbi_taxonomy_directory - .unwrap_or(args.build.database.join("taxonomy")); + let ncbi_taxonomy_directory = &args.build.database.join("taxonomy"); let taxonomy = generate_taxonomy( &ncbi_taxonomy_directory, @@ -83,12 +72,10 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box Result<(), Box, + m_iter: &mut MinimizerIterator, hash_config: &HashConfig, chtable: &CHTable, - offset: u32, -) -> (u32, Vec) { + offset: usize, +) -> usize { let chunk_size = hash_config.hash_capacity; let value_bits = hash_config.value_bits; - - let mut rows = Vec::new(); - let mut kmer_count = 0; - for (sort, hash_key) in miner.into_iter().enumerate() { - let idx = hash_config.index(hash_key); + let data: Vec<(usize, u64)> = m_iter.collect(); + for (sort, hash_key) in data { + let (idx, compacted) = hash_config.compact(hash_key); let partition_index = idx / chunk_size; let index = idx % chunk_size; - let taxid = chtable.get_from_page(index, hash_key, partition_index + 1); + + let taxid = chtable.get_from_page(index, compacted, partition_index + 1); if taxid > 0 { - let compacted_key = hash_key.left(value_bits) as u32; - let high = u32::combined(compacted_key, taxid, value_bits); - let row = Row::new(high, 0, sort as u32 + 1 + offset); + let high = u32::combined(compacted, taxid, value_bits); + let row = Row::new(high, 0, sort as u32 + 1 + offset as u32); rows.push(row); } - kmer_count += 1; } - (kmer_count, rows) + m_iter.size + offset } fn process_record( - dna_id: String, - seq1: Vec, - seq2: Option>, + marker: &mut Base, args: 
&Args, taxonomy: &Taxonomy, - meros: Meros, chtable: &CHTable, hash_config: &HashConfig, cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, ) -> String { - let value_mask = hash_config.value_mask; - let mut seq_len_str = String::new(); - let seq1_len = seq1.len(); - seq_len_str.push_str(&seq1_len.to_string()); - - let scan1 = MinimizerScanner::new(&seq1, meros); - let (kmer_count1, mut rows) = process_seq(scan1, &hash_config, chtable, 0); - let kmer_count2 = if let Some(seq) = seq2 { - let scan2 = MinimizerScanner::new(&seq, meros); - let (kmer_count2, rows2) = process_seq(scan2, &hash_config, chtable, kmer_count1); - rows.extend_from_slice(&rows2); - seq_len_str.push_str(format!("|{}", seq.len()).as_str()); - Some(kmer_count2) - } else { - None - }; - let total_kmers: usize = (kmer_count1 + kmer_count2.unwrap_or(0)) as usize; - let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, kmer_count1); - let hit_string = add_hitlist_string(&rows, value_mask, kmer_count1, kmer_count2, taxonomy); - let mut call = resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold); - if call > 0 && hit_groups < args.minimum_hit_groups { - call = 0; - }; + let id = &marker.header.id.clone(); + let rows: Vec = marker + .fold(|rows, m_iter, offset| process_seq(rows, m_iter, &hash_config, chtable, offset)); + + let hits = HitGroup::new(rows, marker.range()); + + let seq_len_str = marker.fmt_seq_size(); + + let required_score = hits.required_score(args.confidence_threshold); + let hit_data = process_hitgroup( + &hits, + taxonomy, + classify_counter, + required_score, + args.minimum_hit_groups, + hash_config.value_mask, + ); - cur_counts.iter().for_each(|entry| { + hit_data.3.iter().for_each(|(key, value)| { cur_taxon_counts - .entry(*entry.key()) + .entry(*key) .or_default() - .merge(entry.value()) + .merge(value) .unwrap(); }); - - let ext_call = taxonomy.nodes[call as usize].external_id; - let clasify = if call > 0 { - 
classify_counter.fetch_add(1, Ordering::SeqCst); - cur_taxon_counts - .entry(call as u64) - .or_default() - .increment_read_count(); - - "C" - } else { - "U" - }; - // 使用锁来同步写入 - let output_line = format!( + format!( "{}\t{}\t{}\t{}\t{}\n", - clasify, dna_id, ext_call, seq_len_str, hit_string - ); - output_line -} - -fn process_fasta_file( - args: &Args, - meros: Meros, - hash_config: HashConfig, - file_index: usize, - files: &[String], - chtable: &CHTable, - taxonomy: &Taxonomy, - total_taxon_counts: &mut TaxonCounters, -) -> io::Result<(usize, usize)> { - let score = args.minimum_quality_score; - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - - let mut writer: Box = match &args.kraken_output_dir { - Some(ref file_path) => { - let filename = file_path.join(format!("output_{}.txt", file_index)); - let file = File::create(filename)?; - Box::new(BufWriter::new(file)) as Box - } - None => Box::new(io::stdout()) as Box, - }; - - let cur_taxon_counts = TaxonCountersDash::new(); - let sequence_count = AtomicUsize::new(0); - let classify_counter = AtomicUsize::new(0); - - let reader = open_fasta_reader(&file1).expect("Unable to create fasta reader from path"); - read_parallel( - reader, - args.num_threads as u32, - args.num_threads as usize, - |record_set| { - let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = trim_pair_info(records.id().unwrap_or_default()); - sequence_count.fetch_add(1, Ordering::SeqCst); - - let seq1: Vec = records.seq_x(score); - let seq2 = None; - let output_line = process_record( - dna_id, - seq1, - seq2, - args, - taxonomy, - meros, - chtable, - &hash_config, - &cur_taxon_counts, - &classify_counter, - ); - - buffer.push_str(&output_line); - } - buffer - }, - |record_sets| { - while let Some(Ok((_, buffer))) = record_sets.next() { - writer - .write_all(buffer.as_bytes()) - .expect("write data error"); - } - }, - ); - - let mut sample_taxon_counts: HashMap< - u64, - 
kr2r::readcounts::ReadCounts>, - > = HashMap::new(); - cur_taxon_counts.iter().for_each(|entry| { - total_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(&entry.value()) - .unwrap(); - sample_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(&entry.value()) - .unwrap(); - }); - - let thread_sequences = sequence_count.load(Ordering::SeqCst); - let thread_classified = classify_counter.load(Ordering::SeqCst); - if let Some(output) = &args.kraken_output_dir { - let filename = output.join(format!("output_{}.kreport2", file_index)); - report_kraken_style( - filename, - args.report_zero_counts, - args.report_kmer_data, - &taxonomy, - &sample_taxon_counts, - thread_sequences as u64, - (thread_sequences - thread_classified) as u64, - )?; - } - - Ok((thread_sequences, thread_sequences - thread_classified)) + hit_data.0, id, hit_data.1, seq_len_str, hit_data.2 + ) } -/// fastq -fn process_fastq_file( +fn process_fastx_file( args: &Args, meros: Meros, hash_config: HashConfig, file_index: usize, - files: &[String], + reader: &mut R, chtable: &CHTable, taxonomy: &Taxonomy, total_taxon_counts: &mut TaxonCounters, -) -> io::Result<(usize, usize)> { - let score = args.minimum_quality_score; - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); - +) -> io::Result<(usize, usize)> +where + R: Reader, +{ let mut writer: Box = match &args.kraken_output_dir { Some(ref file_path) => { let filename = file_path.join(format!("output_{}.txt", file_index)); @@ -311,45 +174,36 @@ fn process_fastq_file( let cur_taxon_counts = TaxonCountersDash::new(); - let sequence_count = AtomicUsize::new(0); + let seq_counter = AtomicUsize::new(0); let classify_counter = AtomicUsize::new(0); - let reader = seq::PairFastqReader::from_path(&file1, file2.as_ref()) - .expect("Unable to create pair reader from paths"); - read_parallel( + let _ = read_parallel( reader, - args.num_threads as u32, - args.num_threads as 
usize, - |record_set| { + args.num_threads, + &meros, + |seqs| { let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = trim_pair_info(records.0.id().unwrap_or_default()); - sequence_count.fetch_add(1, Ordering::SeqCst); - let seq1: Vec = records.0.seq_x(score); - let seq2 = records.1.map(|seq| seq.seq_x(score)); + for record in seqs { + seq_counter.fetch_add(1, Ordering::SeqCst); let output_line = process_record( - dna_id, - seq1, - seq2, + record, args, taxonomy, - meros, chtable, &hash_config, &cur_taxon_counts, &classify_counter, ); - buffer.push_str(&output_line); } - buffer + + Some(buffer) }, - |record_sets| { - while let Some(Ok((_, buffer))) = record_sets.next() { + |dataset| { + while let Some(Some(res)) = dataset.next() { writer - .write_all(buffer.as_bytes()) - .expect("write data error"); + .write_all(res.as_bytes()) + .expect("Failed to write date to file"); } }, ); @@ -371,7 +225,7 @@ fn process_fastq_file( .unwrap(); }); - let thread_sequences = sequence_count.load(Ordering::SeqCst); + let thread_sequences = seq_counter.load(Ordering::SeqCst); let thread_classified = classify_counter.load(Ordering::SeqCst); if let Some(output) = &args.kraken_output_dir { let filename = output.join(format!("output_{}.kreport2", file_index)); @@ -420,36 +274,22 @@ fn process_files( writeln!(file_writer, "{}\t{}", file_index, file_pair.join(","))?; file_writer.flush().unwrap(); - match detect_file_format(&file_pair[0])? 
{ - FileFormat::Fastq => { - let (thread_sequences, thread_unclassified) = process_fastq_file( - &args, - meros, - hash_config, - file_index, - file_pair, - chtable, - taxonomy, - &mut total_taxon_counts, - )?; - total_seqs += thread_sequences; - total_unclassified += thread_unclassified; - } - FileFormat::Fasta => { - let (thread_sequences, thread_unclassified) = process_fasta_file( - &args, - meros, - hash_config, - file_index, - file_pair, - chtable, - taxonomy, - &mut total_taxon_counts, - )?; - total_seqs += thread_sequences; - total_unclassified += thread_unclassified; - } - } + let score = args.minimum_quality_score; + let paths = OptionPair::from_slice(file_pair); + let mut reader = FastxReader::from_paths(paths, file_index, score)?; + // let mut reader = create_reader(file_pair, file_index, score)?; + let (thread_sequences, thread_unclassified) = process_fastx_file( + &args, + meros, + hash_config, + file_index, + &mut reader, + chtable, + taxonomy, + &mut total_taxon_counts, + )?; + total_seqs += thread_sequences; + total_unclassified += thread_unclassified; } if let Some(output) = &args.kraken_output_dir { let filename = output.join("output.kreport2"); @@ -480,7 +320,7 @@ fn process_files( } pub fn run(args: Args) -> Result<()> { - let options_filename = &args.k2d_dir.join("opts.k2d"); + let options_filename = &args.database.join("opts.k2d"); let idx_opts = IndexOptions::read_index_options(options_filename)?; if args.paired_end_processing && !args.single_file_pairs && args.input_files.len() % 2 != 0 { @@ -491,19 +331,19 @@ pub fn run(args: Args) -> Result<()> { )); } - let taxonomy_filename = args.k2d_dir.join("taxo.k2d"); + let taxonomy_filename = args.database.join("taxo.k2d"); let taxo = Taxonomy::from_file(taxonomy_filename)?; - let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; - println!("hash_config {:?}", hash_config); + 
println!("{:?}", hash_config); if hash_config.hash_capacity == 0 { panic!("`hash_capacity` can't be zero!"); } - println!("start..."); + println!("classify start..."); let start = Instant::now(); let meros = idx_opts.as_meros(); - let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; + let hash_files = find_and_sort_files(&args.database, "hash", ".k2d")?; let chtable = CHTable::from_hash_files(hash_config, hash_files)?; process_files(args, meros, hash_config, &chtable, &taxo)?; diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index c7c34d8..408ea33 100644 --- a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -1,11 +1,10 @@ use clap::{error::ErrorKind, Error, Parser}; use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; use kr2r::args::KLMTArgs; -use kr2r::mmscanner::MinimizerScanner; use kr2r::utils::{find_library_fna_files, format_bytes, open_file}; use kr2r::KBuildHasher; -use seq_io::fasta::{Reader, Record}; -use seq_io::parallel::read_parallel; + +use seqkmer::{read_parallel, FastaReader}; use serde_json; use std::collections::HashSet; use std::fs::File; @@ -49,8 +48,8 @@ pub struct Args { const RANGE_SECTIONS: u64 = 1024; const RANGE_MASK: u64 = RANGE_SECTIONS - 1; -fn build_output_path(input_path: &str, extension: &str) -> String { - let path = Path::new(input_path); +fn build_output_path>(input_path: &P, extension: &str) -> String { + let path = input_path.as_ref(); let parent_dir = path.parent().unwrap_or_else(|| Path::new("")); let stem = path.file_stem().unwrap_or_else(|| path.as_os_str()); @@ -60,8 +59,8 @@ fn build_output_path(input_path: &str, extension: &str) -> String { output_path.to_str().unwrap().to_owned() } -fn process_sequence( - fna_file: &str, +fn process_sequence>( + fna_file: &P, // hllp: &mut HyperLogLogPlus, args: Args, ) -> HyperLogLogPlus { @@ -84,33 +83,36 @@ fn process_sequence( let mut hllp: HyperLogLogPlus = HyperLogLogPlus::new(16, 
KBuildHasher::default()).unwrap(); - let reader = Reader::from_path(fna_file).unwrap(); + let mut reader = FastaReader::from_path(fna_file, 1).unwrap(); let range_n = args.n as u64; read_parallel( - reader, - args.threads as u32, - args.threads - 2 as usize, + &mut reader, + args.threads, + &meros, |record_set| { let mut minimizer_set = HashSet::new(); - for record in record_set.into_iter() { - let seq = record.seq(); - let kmer_iter = MinimizerScanner::new(&seq, meros) - .into_iter() - .filter(|hash_key| hash_key & RANGE_MASK < range_n) - .collect::>(); - - minimizer_set.extend(kmer_iter); + + for record in record_set { + record.body.apply_mut(|m_iter| { + let kmer_iter: HashSet = m_iter + .filter(|(_, hash_key)| *hash_key & RANGE_MASK < range_n) + .map(|(_, hash_key)| hash_key) + .collect(); + + minimizer_set.extend(kmer_iter); + }); } - minimizer_set + Some(minimizer_set) }, |record_sets| { - while let Some(Ok((_, m_set))) = record_sets.next() { + while let Some(Some(m_set)) = record_sets.next() { for minimizer in m_set { hllp.insert(&minimizer); } } }, - ); + ) + .expect("read parallel error"); // 序列化 hllp 对象并将其写入文件 let serialized_hllp = serde_json::to_string(&hllp).unwrap(); @@ -140,7 +142,7 @@ pub fn run(args: Args) -> usize { let source: PathBuf = args.database.clone(); let fna_files = if source.is_file() { - vec![source.to_string_lossy().to_string()] + vec![source.clone()] } else { find_library_fna_files(args.database) }; @@ -149,6 +151,8 @@ pub fn run(args: Args) -> usize { panic!("Error: No library.fna files found in the specified directory. Please ensure that the directory contains at least one library.fna file and try again."); } + println!("estimate start... 
"); + for fna_file in fna_files { let args_clone = Args { database: source.clone(), diff --git a/kr2r/src/bin/hashshard.rs b/kr2r/src/bin/hashshard.rs index 8fce0d8..64150fe 100644 --- a/kr2r/src/bin/hashshard.rs +++ b/kr2r/src/bin/hashshard.rs @@ -53,10 +53,9 @@ pub struct Args { #[clap(long = "db", value_parser, required = true)] database: PathBuf, - /// database hash chunk directory and other files - #[clap(long)] - k2d_dir: Option, - + // /// database hash chunk directory and other files + // #[clap(long)] + // k2d_dir: Option, /// Specifies the hash file capacity. Acceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K'). /// Note: The specified capacity affects the index size, with a factor of 4 applied. For example, specifying '1G' results in an index size of '4G'. /// Default: 1G (capacity 1G = file size 4G) @@ -69,14 +68,14 @@ pub fn run(args: Args) -> IOResult<()> { let hash_config = HashConfig::from_hash_header(index_filename)?; let partition = (hash_config.capacity + args.hash_capacity - 1) / args.hash_capacity; - println!("start..."); + println!("hashshard start..."); // 开始计时 let start = Instant::now(); let file_len = hash_config.capacity * 4 + 32; let b_size = std::mem::size_of::(); - let k2d_dir = args.k2d_dir.unwrap_or(args.database.clone()); + let k2d_dir = args.database.clone(); create_dir_all(&k2d_dir).expect(&format!("create hash dir error {:?}", k2d_dir)); diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs index 756181e..65b097f 100644 --- a/kr2r/src/bin/kun.rs +++ b/kr2r/src/bin/kun.rs @@ -4,12 +4,13 @@ mod build_k2_db; mod classify; mod estimate_capacity; mod hashshard; +mod merge_fna; mod resolve; -mod seqid2taxid; +// mod seqid2taxid; mod splitr; use kr2r::args::ClassifyArgs; -use kr2r::args::{parse_size, Build, Taxo}; +use kr2r::args::{parse_size, Build}; use kr2r::utils::find_and_sort_files; // use std::io::Result; use std::path::PathBuf; @@ -18,23 +19,21 @@ use std::time::Instant; 
#[derive(Parser, Debug, Clone)] #[clap(author, version, about="build database", long_about = None)] struct BuildArgs { - /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: Option, - - /// chunk directory - #[clap(long)] - chunk_dir: PathBuf, - - #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")] - pub hash_capacity: usize, - + // /// database hash chunk directory and other files + // #[clap(long)] + // pub k2d_dir: Option, + /// Directory to store downloaded files + #[arg(short, long, required = true)] + pub download_dir: PathBuf, + + // chunk_dir: PathBuf, #[clap(flatten)] pub build: Build, - #[clap(flatten)] - taxo: Taxo, - + // #[arg(short = 'm')] + // pub id_to_taxon_map_filename: Option, + // #[clap(flatten)] + // taxo: Taxo, /// estimate capacity from cache if exists #[arg(long, default_value_t = true)] cache: bool, @@ -60,7 +59,7 @@ struct Args { impl From for splitr::Args { fn from(item: ClassifyArgs) -> Self { Self { - k2d_dir: item.k2d_dir, + database: item.database, paired_end_processing: item.paired_end_processing, single_file_pairs: item.single_file_pairs, minimum_quality_score: item.minimum_quality_score, @@ -74,7 +73,7 @@ impl From for splitr::Args { impl From for annotate::Args { fn from(item: ClassifyArgs) -> Self { Self { - k2d_dir: item.k2d_dir, + database: item.database, chunk_dir: item.chunk_dir, batch_size: item.batch_size, kraken_db_type: item.kraken_db_type, @@ -85,7 +84,7 @@ impl From for annotate::Args { impl From for resolve::Args { fn from(item: ClassifyArgs) -> Self { Self { - k2d_dir: item.k2d_dir, + database: item.database, chunk_dir: item.chunk_dir, batch_size: 
item.batch_size, confidence_threshold: item.confidence_threshold, @@ -115,19 +114,25 @@ impl From for build_k2_db::Args { fn from(item: BuildArgs) -> Self { Self { build: item.build, - k2d_dir: item.k2d_dir, - taxo: item.taxo, - chunk_dir: item.chunk_dir, - hash_capacity: item.hash_capacity, + hash_capacity: parse_size("1G").unwrap(), } } } -impl From for seqid2taxid::Args { +// impl From for seqid2taxid::Args { +// fn from(item: BuildArgs) -> Self { +// Self { +// database: item.build.database, +// id_to_taxon_map_filename: item.taxo.id_to_taxon_map_filename, +// } +// } +// } + +impl From for merge_fna::Args { fn from(item: BuildArgs) -> Self { Self { + download_dir: item.download_dir, database: item.build.database, - id_to_taxon_map_filename: item.taxo.id_to_taxon_map_filename, } } } @@ -135,7 +140,7 @@ impl From for seqid2taxid::Args { #[derive(Subcommand, Debug)] enum Commands { Estimate(estimate_capacity::Args), - Seqid2taxid(seqid2taxid::Args), + // Seqid2taxid(seqid2taxid::Args), Build(BuildArgs), Hashshard(hashshard::Args), Splitr(splitr::Args), @@ -143,21 +148,25 @@ enum Commands { Resolve(resolve::Args), Classify(ClassifyArgs), Direct(classify::Args), + MergeFna(merge_fna::Args), } fn main() -> Result<(), Box> { let args = Args::parse(); match args.cmd { + Commands::MergeFna(cmd_args) => { + merge_fna::run(cmd_args)?; + } Commands::Estimate(cmd_args) => { estimate_capacity::run(cmd_args); } - Commands::Seqid2taxid(cmd_args) => { - seqid2taxid::run(cmd_args)?; - } + // Commands::Seqid2taxid(cmd_args) => { + // seqid2taxid::run(cmd_args)?; + // } Commands::Build(cmd_args) => { - let seq_args = seqid2taxid::Args::from(cmd_args.clone()); - seqid2taxid::run(seq_args)?; + let fna_args = merge_fna::Args::from(cmd_args.clone()); + merge_fna::run(fna_args)?; let ec_args = estimate_capacity::Args::from(cmd_args.clone()); let required_capacity = estimate_capacity::run(ec_args); @@ -181,10 +190,15 @@ fn main() -> Result<(), Box> { let splitr_args = 
splitr::Args::from(cmd_args.clone()); let chunk_files = find_and_sort_files(&splitr_args.chunk_dir, "sample", ".k2")?; - if !chunk_files.is_empty() { + let sample_files = find_and_sort_files(&splitr_args.chunk_dir, "sample", ".map")?; + let bin_files = find_and_sort_files(&splitr_args.chunk_dir, "sample", ".map")?; + if !chunk_files.is_empty() || !sample_files.is_empty() || !bin_files.is_empty() { return Err(Box::new(std::io::Error::new( std::io::ErrorKind::Other, - format!("{} must be empty", &splitr_args.chunk_dir.display()), + format!( + "{} `sample` files must be empty", + &splitr_args.chunk_dir.display() + ), ))); } splitr::run(splitr_args)?; diff --git a/kr2r/src/bin/merge_fna.rs b/kr2r/src/bin/merge_fna.rs new file mode 100644 index 0000000..089d81b --- /dev/null +++ b/kr2r/src/bin/merge_fna.rs @@ -0,0 +1,225 @@ +use clap::Parser; + +use flate2::read::GzDecoder; +use kr2r::utils::{find_files, open_file}; +use std::collections::HashMap; +use std::fs::{create_dir_all, File, OpenOptions}; +use std::io::{BufRead, BufReader, BufWriter, Result, Write}; +use std::path::PathBuf; +use std::time::Instant; + +#[derive(Parser, Debug, Clone)] +#[clap(version, about = "A tool for processing genomic files")] +pub struct Args { + /// Directory to store downloaded files + #[arg(short, long, default_value = "lib")] + pub download_dir: PathBuf, + + /// ncbi library fna database directory + #[arg(long = "db", required = true)] + pub database: PathBuf, + // /// seqid2taxid.map file path, default = $database/seqid2taxid.map + // #[arg(short = 'm', long)] + // pub id_to_taxon_map_filename: Option, +} + +fn parse_assembly_fna(assembly_file: &PathBuf, site: &str) -> Result> { + let mut gz_files: HashMap = HashMap::new(); + let file = open_file(&assembly_file)?; + let reader = BufReader::new(file); + let lines = reader.lines(); + + let parent_path = assembly_file + .parent() + .expect("Can't find assembly file parent directory"); + for line in lines { + let line = line?; + if 
line.starts_with('#') { + continue; + } + + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() > 19 { + let (taxid, _, ftp_path) = (fields[5], fields[11], fields[19]); + + if ftp_path == "na" { + continue; + } + + // let levels = vec!["Complete Genome", "Chromosome"]; + // if !levels.contains(&asm_level) { + // continue; + // } + + let fna_file_name = format!( + "{}/{}/{}_genomic.fna.gz", + parent_path.to_string_lossy(), + site, + ftp_path.split('/').last().unwrap_or_default() + ); + gz_files.insert(fna_file_name, taxid.into()); + } + } + Ok(gz_files) +} + +fn process_gz_file( + gz_file: &PathBuf, + map_writer: &mut BufWriter, + fna_writer: &mut BufWriter, + fna_start: ®ex::Regex, + taxid: &str, +) -> Result<()> { + let file = open_file(gz_file)?; + let decompressor = GzDecoder::new(BufReader::new(file)); + let mut reader = BufReader::new(decompressor); + + let mut line = String::new(); + let mut map_buffer = String::new(); // Buffer for map writer + let mut fna_buffer = String::new(); // Buffer for fna writer + + while reader.read_line(&mut line)? 
!= 0 { + if let Some(caps) = fna_start.captures(&line) { + let seqid = &caps[1]; + map_buffer.push_str(&format!("kraken:taxid|{}|{}\t{}\n", taxid, seqid, taxid)); + fna_buffer.push_str(&format!(">kraken:taxid|{}|{}", taxid, &line[1..])); + } else { + fna_buffer.push_str(&line); + } + + // Write to the writers if the buffer size exceeds a certain threshold + if map_buffer.len() > 10000 { + map_writer.write_all(map_buffer.as_bytes())?; + map_buffer.clear(); + } + + if fna_buffer.len() > 10000 { + fna_writer.write_all(fna_buffer.as_bytes())?; + fna_buffer.clear(); + } + + line.clear(); + } + + // Write any remaining buffered content + if !map_buffer.is_empty() { + map_writer.write_all(map_buffer.as_bytes())?; + } + + if !fna_buffer.is_empty() { + fna_writer.write_all(fna_buffer.as_bytes())?; + } + + fna_writer.flush()?; + map_writer.flush()?; + + Ok(()) +} + +const PREFIX: &'static str = "assembly_summary"; +const SUFFIX: &'static str = "txt"; + +fn merge_fna(assembly_files: &Vec, database: &PathBuf) -> Result<()> { + let pattern = format!(r"{}_(\S+)\.{}", PREFIX, SUFFIX); + let file_site = regex::Regex::new(&pattern).unwrap(); + + let library_fna_path = database.join("library.fna"); + let seqid2taxid_path = database.join("seqid2taxid.map"); + let mut fna_writer = BufWriter::new( + OpenOptions::new() + .create(true) + .write(true) + .open(&library_fna_path)?, + ); + let mut map_writer = BufWriter::new( + OpenOptions::new() + .create(true) + .write(true) + .open(&seqid2taxid_path)?, + ); + + let fna_start: regex::Regex = regex::Regex::new(r"^>(\S+)").unwrap(); + for assembly_file in assembly_files { + if let Some(caps) = file_site.captures(assembly_file.to_string_lossy().as_ref()) { + if let Some(matched) = caps.get(1) { + let gz_files = parse_assembly_fna(assembly_file, matched.as_str())?; + + for (gz_path, taxid) in gz_files { + let gz_file = PathBuf::from(&gz_path); + if !gz_file.exists() { + // eprintln!("{} does not exist", gz_file.to_string_lossy()); + continue; 
+ } + + process_gz_file( + &gz_file, + &mut map_writer, + &mut fna_writer, + &fna_start, + &taxid, + )?; + } + + fna_writer.flush()?; + map_writer.flush()?; + } + } + } + + Ok(()) +} + +pub fn run(args: Args) -> Result<()> { + // 开始计时 + let start = Instant::now(); + println!("merge fna start..."); + let download_dir = args.download_dir; + let database = &args.database; + + let dst_tax_dir = database.join("taxonomy"); + create_dir_all(&dst_tax_dir)?; + + let source_names_file = &download_dir.join("taxonomy").join("names.dmp"); + assert!(source_names_file.exists()); + let dst_name_file = &dst_tax_dir.join("names.dmp"); + if !dst_name_file.exists() { + std::fs::copy(source_names_file, dst_name_file)?; + } + + let source_nodes_file = &download_dir.join("taxonomy").join("nodes.dmp"); + assert!(source_nodes_file.exists()); + let dst_nodes_file = &dst_tax_dir.join("nodes.dmp"); + if !dst_nodes_file.exists() { + std::fs::copy(source_nodes_file, dst_nodes_file)?; + } + + let library_fna_path = database.join("library.fna"); + let seqid2taxid_path = database.join("seqid2taxid.map"); + if library_fna_path.exists() && seqid2taxid_path.exists() { + println!("library.fna and seqid2taxid.map exists!"); + return Ok(()); + } + + if library_fna_path.exists() { + std::fs::remove_file(library_fna_path)?; + } + if seqid2taxid_path.exists() { + std::fs::remove_file(seqid2taxid_path)?; + } + let assembly_files = find_files(&download_dir, &PREFIX, &SUFFIX); + + merge_fna(&assembly_files, &args.database)?; + + // 计算持续时间 + let duration = start.elapsed(); + println!("merge fna took: {:?}", duration); + Ok(()) +} + +#[allow(dead_code)] +fn main() { + let args = Args::parse(); + if let Err(e) = run(args) { + eprintln!("Application error: {}", e); + } +} diff --git a/kr2r/src/bin/resolve.rs b/kr2r/src/bin/resolve.rs index a22f559..2a86376 100644 --- a/kr2r/src/bin/resolve.rs +++ b/kr2r/src/bin/resolve.rs @@ -1,12 +1,14 @@ use clap::Parser; use dashmap::{DashMap, DashSet}; -use 
kr2r::classify::{add_hitlist_string, count_values, resolve_tree, trim_pair_info}; +use kr2r::classify::process_hitgroup; use kr2r::compact_hash::{HashConfig, Row}; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{find_and_sort_files, open_file}; +use kr2r::HitGroup; use rayon::prelude::*; +use seqkmer::{trim_pair_info, OptionPair}; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}; @@ -19,7 +21,7 @@ const BATCH_SIZE: usize = 8 * 1024 * 1024; pub fn read_id_to_seq_map>( filename: P, -) -> Result)>> { +) -> Result)>> { let file = open_file(filename)?; let reader = BufReader::new(file); let id_map = DashMap::new(); @@ -34,9 +36,9 @@ pub fn read_id_to_seq_map>( let seq_id = parts[1].to_string(); let seq_size = parts[2].to_string(); let count_parts: Vec<&str> = parts[3].split('|').collect(); - let kmer_count1 = count_parts[0].parse::().unwrap(); + let kmer_count1 = count_parts[0].parse::().unwrap(); let kmer_count2 = if count_parts.len() > 1 { - count_parts[1].parse::().map_or(None, |i| Some(i)) + count_parts[1].parse::().map_or(None, |i| Some(i)) } else { None }; @@ -56,8 +58,8 @@ pub fn read_id_to_seq_map>( )] pub struct Args { /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + #[arg(long = "db", required = true)] + pub database: PathBuf, /// chunk directory #[clap(long, value_parser, required = true)] @@ -105,7 +107,7 @@ fn process_batch>( sample_file: P, args: &Args, taxonomy: &Taxonomy, - id_map: &DashMap)>, + id_map: &DashMap)>, writer: &Mutex>, value_mask: usize, ) -> Result<(TaxonCountersDash, usize, DashSet)> { @@ -147,39 +149,50 @@ fn process_batch>( hit_counts.into_par_iter().for_each(|(k, mut rows)| { if let Some(item) = id_map.get(&k) { rows.sort_unstable(); - let total_kmers: usize = item.2 as usize + item.3.unwrap_or(0) as usize; let dna_id = 
trim_pair_info(&item.0); - let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, item.2); - let hit_string = add_hitlist_string(&rows, value_mask, item.2, item.3, taxonomy); - let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); - if call > 0 && hit_groups < minimum_hit_groups { - call = 0; - }; - - cur_counts.iter().for_each(|entry| { + let range = OptionPair::from(((0, item.2), item.3.map(|size| (item.2, size + item.2)))); + let hits = HitGroup::new(rows, range); + + let hit_data = process_hitgroup( + &hits, + taxonomy, + &classify_counter, + hits.required_score(confidence_threshold), + minimum_hit_groups, + value_mask, + ); + // let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, item.2); + // let hit_string = add_hitlist_string(&rows, value_mask, item.2, item.3, taxonomy); + // let require_score = (confidence_threshold * total_kmers as f64).ceil() as u64; + // let mut call = resolve_tree(&counts, taxonomy, require_score); + // if call > 0 && hit_groups < minimum_hit_groups { + // call = 0; + // }; + + hit_data.3.iter().for_each(|(key, value)| { cur_taxon_counts - .entry(*entry.key()) + .entry(*key) .or_default() - .merge(entry.value()) + .merge(value) .unwrap(); }); - let ext_call = taxonomy.nodes[call as usize].external_id; - let clasify = if call > 0 { - classify_counter.fetch_add(1, Ordering::SeqCst); - cur_taxon_counts - .entry(call as u64) - .or_default() - .increment_read_count(); - - "C" - } else { - "U" - }; + // let ext_call = taxonomy.nodes[call as usize].external_id; + // let clasify = if call > 0 { + // classify_counter.fetch_add(1, Ordering::SeqCst); + // cur_taxon_counts + // .entry(call as u64) + // .or_default() + // .increment_read_count(); + + // "C" + // } else { + // "U" + // }; // 使用锁来同步写入 let output_line = format!( "{}\t{}\t{}\t{}\t{}\n", - clasify, dna_id, ext_call, item.1, hit_string + hit_data.0, dna_id, hit_data.1, item.1, hit_data.2 ); let mut file = 
writer.lock().unwrap(); file.write_all(output_line.as_bytes()).unwrap(); @@ -193,7 +206,7 @@ fn process_batch>( } pub fn run(args: Args) -> Result<()> { - let k2d_dir = &args.k2d_dir; + let k2d_dir = &args.database; let taxonomy_filename = k2d_dir.join("taxo.k2d"); let taxo = Taxonomy::from_file(taxonomy_filename)?; @@ -201,7 +214,7 @@ pub fn run(args: Args) -> Result<()> { let sample_id_files = find_and_sort_files(&args.chunk_dir, "sample_id", ".map")?; let partition = sample_files.len(); - let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; let value_mask = hash_config.value_mask; let mut total_taxon_counts = TaxonCounters::new(); @@ -210,7 +223,7 @@ pub fn run(args: Args) -> Result<()> { // 开始计时 let start = Instant::now(); - println!("start..."); + println!("resolve start..."); for i in 0..partition { let sample_file = &sample_files[i]; @@ -240,8 +253,13 @@ pub fn run(args: Args) -> Result<()> { .filter(|item| !hit_seq_set.contains(item.key())) .for_each(|item| { let dna_id = trim_pair_info(&item.0); - let hit_string = add_hitlist_string(&vec![], value_mask, item.2, item.3, &taxo); - let output_line = format!("U\t{}\t0\t{}\t{}\n", dna_id, item.1, hit_string); + let output_line = format!( + "U\t{}\t0\t{}\t{}\n", + dna_id, + item.1, + if item.3.is_none() { "" } else { " |:| " } + ); + let mut file = writer.lock().unwrap(); file.write_all(output_line.as_bytes()).unwrap(); }); diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index 372fd8c..cdf2462 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -1,22 +1,16 @@ +use clap::Parser; use kr2r::compact_hash::{HashConfig, Slot}; -use kr2r::mmscanner::MinimizerScanner; -use kr2r::seq::{self, open_fasta_reader, SeqX}; use kr2r::utils::{ - create_partition_files, create_partition_writers, create_sample_file, detect_file_format, - get_file_limit, get_lastest_file_index, 
FileFormat, + create_partition_files, create_partition_writers, create_sample_file, get_file_limit, + get_lastest_file_index, }; -use kr2r::{IndexOptions, Meros}; -use seq_io::fasta::Record; -use seq_io::fastq::Record as FqRecord; -use seq_io::parallel::read_parallel; +use kr2r::IndexOptions; +use seqkmer::{read_parallel, FastxReader, Meros, MinimizerIterator, OptionPair, Reader}; use std::fs; use std::io::{BufWriter, Write}; use std::io::{Error, ErrorKind, Result}; use std::path::PathBuf; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; - -use clap::Parser; /// Command line arguments for the splitr program. /// /// This structure defines the command line arguments that are accepted by the splitr program. @@ -29,8 +23,8 @@ use clap::Parser; )] pub struct Args { /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + #[arg(long = "db", required = true)] + pub database: PathBuf, // /// The file path for the Kraken 2 options. // #[clap(short = 'o', long = "options-filename", value_parser, required = true)] @@ -43,7 +37,7 @@ pub struct Args { #[clap(short = 'S', long = "single-file-pairs", action)] pub single_file_pairs: bool, - /// Minimum quality score for FASTQ data, default is 0. + /// Minimum quality score for FASTQ data. #[clap( short = 'Q', long = "minimum-quality-score", @@ -52,9 +46,9 @@ pub struct Args { )] pub minimum_quality_score: i32, - /// The number of threads to use, default is 10. - #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = 10)] - pub num_threads: i32, + /// The number of threads to use. 
+ #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = num_cpus::get())] + pub num_threads: usize, /// chunk directory #[clap(long)] @@ -100,31 +94,23 @@ fn init_chunk_writers( } /// 处理record -fn process_record( - iter: I, +fn process_record( + k2_slot_list: &mut Vec<(usize, Slot)>, + marker: &mut MinimizerIterator, hash_config: &HashConfig, - seq_id: u64, chunk_size: usize, + seq_id: u64, idx_bits: usize, - seq_index: &AtomicUsize, -) -> (usize, Vec<(usize, Slot)>) -where - I: Iterator, -{ - let mut k2_slot_list = Vec::new(); - let mut kmer_count = 0; - - for hash_key in iter.into_iter() { +) { + let offset = k2_slot_list.len(); + for (sort, hash_key) in marker { let mut slot = hash_config.slot_u64(hash_key, seq_id); - let seq_sort = seq_index.fetch_add(1, Ordering::SeqCst); + let seq_sort = sort + offset; let partition_index = slot.idx / chunk_size; slot.idx = seq_sort << idx_bits | (slot.idx % chunk_size); - k2_slot_list.push((partition_index, slot)); - kmer_count += 1; } - (kmer_count, k2_slot_list) } fn write_data_to_file( @@ -144,153 +130,65 @@ fn write_data_to_file( sample_writer.write_all(k2_map.as_bytes()).unwrap(); } -fn process_fastq_file( +fn process_fastx_file( args: &Args, meros: Meros, hash_config: HashConfig, file_index: usize, - files: &[String], + reader: &mut R, writers: &mut Vec>, sample_writer: &mut BufWriter, -) { +) -> Result<()> +where + R: Reader, +{ let chunk_size = hash_config.hash_capacity; let idx_bits = ((chunk_size as f64).log2().ceil() as usize).max(1); let slot_size = std::mem::size_of::>(); - let score = args.minimum_quality_score; - - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); - let line_index = AtomicUsize::new(0); - - let reader = seq::PairFastqReader::from_path(&file1, file2.as_ref()) - .expect("Unable to create pair reader from paths"); read_parallel( reader, - args.num_threads as u32, args.num_threads as usize, - 
|record_set| { - let mut k2_slot_list = Vec::new(); - + &meros, + |seqs| { let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = records.0.id().unwrap_or_default().to_string(); - // 拼接seq_id - let index = line_index.fetch_add(1, Ordering::SeqCst); + let mut k2_slot_list = Vec::new(); + for seq in seqs { + let mut init: Vec<(usize, Slot)> = Vec::new(); + let header = &seq.header; + let index = header.reads_index; + let dna_id = header.id.clone(); let seq_id = (file_index << 32 | index) as u64; - let seq_index = AtomicUsize::new(0); - - let seq1 = records.0.seq_x(score); - let scan1 = MinimizerScanner::new(&seq1, meros); - - let (kmer_count1, slot_list1) = process_record( - scan1, - &hash_config, - seq_id, - chunk_size, - idx_bits, - &seq_index, - ); - k2_slot_list.extend(slot_list1); - let (kmer_count, seq_size) = if let Some(record3) = records.1 { - let seq2 = record3.seq_x(score); - let scan2 = MinimizerScanner::new(&seq2, meros); - let (kmer_count2, slot_list2) = process_record( - scan2, + seq.body.apply_mut(|m_iter| { + process_record( + &mut init, + m_iter, &hash_config, - seq_id, chunk_size, + seq_id, idx_bits, - &seq_index, ); - k2_slot_list.extend(slot_list2); - ( - format!("{}|{}", kmer_count1, kmer_count2), - format!("{}|{}", seq1.len(), seq2.len()), - ) - } else { - (kmer_count1.to_string(), format!("{}", seq1.len())) - }; + }); + k2_slot_list.extend_from_slice(&init); + let size_str = seq.fmt_size(); + let seq_size_str = seq.fmt_seq_size(); buffer.push_str( - format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_size, kmer_count).as_str(), + format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_size_str, size_str).as_str(), ); } - (buffer, k2_slot_list) + Some((buffer, k2_slot_list)) }, - |record_sets| { - while let Some(Ok((_, (k2_map, k2_slot_list)))) = record_sets.next() { - write_data_to_file(k2_map, k2_slot_list, writers, slot_size, sample_writer); + |dataset| { + while let Some(Some((buffer, k2_slot_list))) = dataset.next() { 
+ write_data_to_file(buffer, k2_slot_list, writers, slot_size, sample_writer); } }, ) -} - -fn process_fasta_file( - args: &Args, - meros: Meros, - hash_config: HashConfig, - file_index: usize, - files: &[String], - writers: &mut Vec>, - sample_writer: &mut BufWriter, -) { - let chunk_size = hash_config.hash_capacity; - let idx_bits = ((chunk_size as f64).log2().ceil() as usize).max(1); - let slot_size = std::mem::size_of::>(); - let score = args.minimum_quality_score; - - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - - let line_index = AtomicUsize::new(0); - - let reader = open_fasta_reader(&file1).expect("Unable to create fasta reader from path"); - read_parallel( - reader, - args.num_threads as u32, - args.num_threads as usize, - |record_set| { - let mut k2_slot_list = Vec::new(); - - let mut buffer = String::new(); + .expect("failed"); - for records in record_set.into_iter() { - let dna_id = records.id().unwrap_or_default().to_string(); - // 拼接seq_id - let index = line_index.fetch_add(1, Ordering::SeqCst); - let seq_id = (file_index << 32 | index) as u64; - let seq_index = AtomicUsize::new(0); - - let seq1 = records.seq_x(score); - let scan1 = MinimizerScanner::new(&seq1, meros); - - let (kmer_count1, slot_list) = process_record( - scan1, - &hash_config, - seq_id, - chunk_size, - idx_bits, - &seq_index, - ); - - k2_slot_list.extend(slot_list); - let (kmer_count, seq_size) = (kmer_count1.to_string(), format!("{}", seq1.len())); - buffer.push_str( - format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_size, kmer_count).as_str(), - ); - } - (buffer, k2_slot_list) - }, - |record_sets| { - while let Some(Ok((_, (k2_map, k2_slot_list)))) = record_sets.next() { - write_data_to_file(k2_map, k2_slot_list, writers, slot_size, sample_writer); - } - }, - ) + Ok(()) } fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { @@ -322,30 +220,19 @@ fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> 
Result<()> { let mut sample_writer = create_sample_file(args.chunk_dir.join(format!("sample_id_{}.map", file_index))); - match detect_file_format(&file_pair[0])? { - FileFormat::Fastq => { - process_fastq_file( - &args, - meros, - hash_config, - file_index, - file_pair, - &mut writers, - &mut sample_writer, - ); - } - FileFormat::Fasta => { - process_fasta_file( - &args, - meros, - hash_config, - file_index, - file_pair, - &mut writers, - &mut sample_writer, - ); - } - } + let score = args.minimum_quality_score; + let paths = OptionPair::from_slice(file_pair); + let mut reader = FastxReader::from_paths(paths, file_index, score)?; + process_fastx_file( + &args, + meros, + hash_config, + file_index, + &mut reader, + &mut writers, + &mut sample_writer, + ) + .expect("process fastx file error"); } Ok(()) }; @@ -364,7 +251,7 @@ fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { pub fn run(args: Args) -> Result<()> { // let args = Args::parse(); - let options_filename = &args.k2d_dir.join("opts.k2d"); + let options_filename = &args.database.join("opts.k2d"); let idx_opts = IndexOptions::read_index_options(options_filename)?; if args.paired_end_processing && !args.single_file_pairs && args.input_files.len() % 2 != 0 { @@ -374,13 +261,13 @@ pub fn run(args: Args) -> Result<()> { "Paired-end processing requires an even number of input files.", )); } - let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; println!("hash_config {:?}", hash_config); if hash_config.hash_capacity == 0 { panic!("`hash_capacity` can't be zero!"); } - println!("start..."); + println!("splitr start..."); let file_num_limit = get_file_limit(); if hash_config.partition >= file_num_limit { panic!("Exceeds File Number Limit"); diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index aadb67a..71229bb 100644 --- a/kr2r/src/classify.rs +++ 
b/kr2r/src/classify.rs @@ -1,82 +1,72 @@ -use crate::compact_hash::{Compact, Row}; -use crate::readcounts::TaxonCountersDash; +use crate::compact_hash::Compact; +use crate::readcounts::TaxonCounters; use crate::taxonomy::Taxonomy; +use crate::HitGroup; +use seqkmer::SpaceDist; use std::collections::HashMap; - -fn generate_hit_string( - count: u32, - rows: &Vec, - taxonomy: &Taxonomy, - value_mask: usize, - offset: u32, -) -> String { - let mut result = Vec::new(); - let mut last_pos = 0; - - for row in rows { - if row.kmer_id < offset || row.kmer_id >= offset + count { - continue; - } - let adjusted_pos = row.kmer_id - offset; - - let value = row.value; - let key = value.right(value_mask); - let ext_code = taxonomy.nodes[key as usize].external_id; - - if last_pos == 0 && adjusted_pos > 0 { - result.push((0, adjusted_pos)); // 在开始处添加0 - } else if adjusted_pos - last_pos > 1 { - result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 - } - if let Some(last) = result.last_mut() { - if last.0 == ext_code { - last.1 += 1; - last_pos = adjusted_pos; - continue; - } - } - - // 添加当前key的计数 - result.push((ext_code, 1)); - last_pos = adjusted_pos; - } - - // 填充尾随0 - if last_pos < count - 1 { - if last_pos == 0 { - result.push((0, count - last_pos)); - } else { - result.push((0, count - last_pos - 1)); - } - } - - result - .iter() - .map(|i| format!("{}:{}", i.0, i.1)) - .collect::>() - .join(" ") -} - -pub fn trim_pair_info(id: &str) -> String { - let sz = id.len(); - if sz <= 2 { - return id.to_string(); - } - if id.ends_with("/1") || id.ends_with("/2") { - return id[0..sz - 2].to_string(); - } - id.to_string() -} +use std::sync::atomic::{AtomicUsize, Ordering}; + +// fn generate_hit_string( +// count: usize, +// rows: &Vec, +// taxonomy: &Taxonomy, +// value_mask: usize, +// offset: usize, +// ) -> String { +// let mut result = Vec::new(); +// let mut last_pos = 0; + +// for row in rows { +// let sort = row.kmer_id as usize; +// if sort < offset || sort >= offset + 
count { +// continue; +// } +// let adjusted_pos = row.kmer_id as usize - offset; + +// let value = row.value; +// let key = value.right(value_mask); +// let ext_code = taxonomy.nodes[key as usize].external_id; + +// if last_pos == 0 && adjusted_pos > 0 { +// result.push((0, adjusted_pos)); // 在开始处添加0 +// } else if adjusted_pos - last_pos > 1 { +// result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 +// } +// if let Some(last) = result.last_mut() { +// if last.0 == ext_code { +// last.1 += 1; +// last_pos = adjusted_pos; +// continue; +// } +// } + +// // 添加当前key的计数 +// result.push((ext_code, 1)); +// last_pos = adjusted_pos; +// } + +// // 填充尾随0 +// if last_pos < count - 1 { +// if last_pos == 0 { +// result.push((0, count - last_pos)); +// } else { +// result.push((0, count - last_pos - 1)); +// } +// } + +// result +// .iter() +// .map(|i| format!("{}:{}", i.0, i.1)) +// .collect::>() +// .join(" ") +// } // &HashMap, pub fn resolve_tree( hit_counts: &HashMap, taxonomy: &Taxonomy, - total_minimizers: usize, - confidence_threshold: f64, + required_score: u64, ) -> u32 { - let required_score = (confidence_threshold * total_minimizers as f64).ceil() as u64; - let mut max_taxon = 0u32; let mut max_score = 0; @@ -115,53 +105,131 @@ pub fn resolve_tree( max_taxon } -pub fn add_hitlist_string( - rows: &Vec, +// pub fn add_hitlist_string( +// rows: &Vec, +// value_mask: usize, +// kmer_count1: usize, +// kmer_count2: Option, +// taxonomy: &Taxonomy, +// ) -> String { +// let result1 = generate_hit_string(kmer_count1, &rows, taxonomy, value_mask, 0); +// if let Some(count) = kmer_count2 { +// let result2 = generate_hit_string(count, &rows, taxonomy, value_mask, kmer_count1); +// format!("{} |:| {}", result1, result2) +// } else { +// format!("{}", result1) +// } +// } + +// pub fn count_values( +// rows: &Vec, +// value_mask: usize, +// kmer_count1: u32, +// ) -> (HashMap, TaxonCountersDash, usize) { +// let mut counts = HashMap::new(); + +// let mut 
hit_count: usize = 0; + +// let mut last_row: Row = Row::new(0, 0, 0); +// let cur_taxon_counts = TaxonCountersDash::new(); + +// for row in rows { +// let value = row.value; +// let key = value.right(value_mask); +// *counts.entry(key).or_insert(0) += 1; + +// // 如果切换到第2条seq,就重新计算 +// if last_row.kmer_id < kmer_count1 && row.kmer_id > kmer_count1 { +// last_row = Row::new(0, 0, 0); +// } +// if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { +// cur_taxon_counts +// .entry(key as u64) +// .or_default() +// .add_kmer(value as u64); +// hit_count += 1; +// } + +// last_row = *row; +// } + +// (counts, cur_taxon_counts, hit_count) +// } + +fn stat_hits<'a>( + hits: &HitGroup, + counts: &mut HashMap, value_mask: usize, - kmer_count1: u32, - kmer_count2: Option, taxonomy: &Taxonomy, + cur_taxon_counts: &mut TaxonCounters, ) -> String { - let result1 = generate_hit_string(kmer_count1, &rows, taxonomy, value_mask, 0); - if let Some(count) = kmer_count2 { - let result2 = generate_hit_string(count, &rows, taxonomy, value_mask, kmer_count1); - format!("{} |:| {}", result1, result2) - } else { - format!("{}", result1) - } -} - -pub fn count_values( - rows: &Vec, - value_mask: usize, - kmer_count1: u32, -) -> (HashMap, TaxonCountersDash, usize) { - let mut counts = HashMap::new(); - - let mut hit_count: usize = 0; - - let mut last_row: Row = Row::new(0, 0, 0); - let cur_taxon_counts = TaxonCountersDash::new(); - - for row in rows { + let mut space_dist = hits.range.apply(|range| SpaceDist::new(*range)); + for row in &hits.rows { let value = row.value; let key = value.right(value_mask); + *counts.entry(key).or_insert(0) += 1; - // 如果切换到第2条seq,就重新计算 - if last_row.kmer_id < kmer_count1 && row.kmer_id > kmer_count1 { - last_row = Row::new(0, 0, 0); - } - if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { - cur_taxon_counts - .entry(key as u64) - .or_default() - .add_kmer(value as u64); - hit_count += 1; - } + cur_taxon_counts + .entry(key 
as u64) + .or_default() + .add_kmer(value as u64); - last_row = *row; + let ext_code = taxonomy.nodes[key as usize].external_id; + let pos = row.kmer_id as usize; + space_dist.add(ext_code, pos); } - (counts, cur_taxon_counts, hit_count) + space_dist.fill_tail_with_zeros(); + space_dist.reduce_str(" |:| ", |str| str.to_string()) +} + +pub fn process_hitgroup( + hits: &HitGroup, + taxonomy: &Taxonomy, + classify_counter: &AtomicUsize, + required_score: u64, + minimum_hit_groups: usize, + value_mask: usize, +) -> (String, u64, String, TaxonCounters) { + // let value_mask = hash_config.value_mask; + + let mut cur_taxon_counts = TaxonCounters::new(); + let mut counts = HashMap::new(); + let hit_groups = hits.capacity(); + let hit_string = stat_hits( + hits, + &mut counts, + value_mask, + taxonomy, + &mut cur_taxon_counts, + ); + + // cur_counts.iter().for_each(|(key, value)| { + // cur_taxon_counts + // .entry(*key) + // .or_default() + // .merge(value) + // .unwrap(); + // }); + + let mut call = resolve_tree(&counts, taxonomy, required_score); + if call > 0 && hit_groups < minimum_hit_groups { + call = 0; + }; + + let ext_call = taxonomy.nodes[call as usize].external_id; + let clasify = if call > 0 { + classify_counter.fetch_add(1, Ordering::SeqCst); + cur_taxon_counts + .entry(call as u64) + .or_default() + .increment_read_count(); + + "C" + } else { + "U" + }; + + (clasify.to_owned(), ext_call, hit_string, cur_taxon_counts) } diff --git a/kr2r/src/compact_hash.rs b/kr2r/src/compact_hash.rs index 779a130..c4a87f1 100644 --- a/kr2r/src/compact_hash.rs +++ b/kr2r/src/compact_hash.rs @@ -186,7 +186,7 @@ pub struct HashConfig { // 为HashConfig手动实现Debug trait impl fmt::Debug for HashConfig { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("CompactHashTableConfig") + f.debug_struct("HashConfig") .field("value_mask", &self.value_mask) .field("value_bits", &self.value_bits) .field("capacity", &self.capacity) @@ -256,6 +256,10 @@ impl HashConfig { 
hash_key as usize % self.capacity } + pub fn compact(&self, hash_key: u64) -> (usize, u32) { + (self.index(hash_key), hash_key.left(self.value_bits) as u32) + } + pub fn slot(&self, hash_key: u64, taxid: u32) -> Slot { let idx = self.index(hash_key); Slot::::new(idx, u32::hash_value(hash_key, self.value_bits, taxid)) @@ -370,11 +374,11 @@ impl Page { pub fn find_index( &self, index: usize, - value: u64, + compacted_key: u32, value_bits: usize, value_mask: usize, ) -> u32 { - let compacted_key = value.left(value_bits) as u32; + // let compacted_key = value.left(value_bits) as u32; let mut idx = index; if idx > self.size { return u32::default(); @@ -464,9 +468,14 @@ impl CHTable { Ok(chtm) } - pub fn get_from_page(&self, indx: usize, value: u64, page_index: usize) -> u32 { + pub fn get_from_page(&self, indx: usize, compacted: u32, page_index: usize) -> u32 { if let Some(page) = self.pages.get(page_index) { - page.find_index(indx, value, self.config.value_bits, self.config.value_mask) + page.find_index( + indx, + compacted, + self.config.value_bits, + self.config.value_mask, + ) } else { 0 } diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index 5570cb5..6bf80f2 100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -1,20 +1,18 @@ // 使用时需要引用模块路径 use crate::compact_hash::{Compact, HashConfig, Slot}; -use crate::mmscanner::MinimizerScanner; +// use crate::mmscanner::MinimizerScanner; use crate::taxonomy::{NCBITaxonomy, Taxonomy}; -use crate::Meros; +use seqkmer::Meros; use crate::utils::open_file; use byteorder::{LittleEndian, WriteBytesExt}; use rayon::prelude::*; -use seq_io::fasta::{Reader, Record}; -use seq_io::parallel::read_parallel; +use seqkmer::{read_parallel, FastaReader}; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, BufWriter, Read, Result as IOResult, Write}; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering}; - // 定义每批次处理的 Cell 数量 const BATCH_SIZE: usize = 81920; @@ -190,6 +188,58 @@ pub fn 
get_bits_for_taxid( Ok(bits_needed_for_value.max(requested_bits_for_taxid)) } +// /// 将fna文件转换成k2格式的临时文件 +// pub fn convert_fna_to_k2_format>( +// fna_file: P, +// meros: Meros, +// taxonomy: &Taxonomy, +// id_to_taxon_map: &HashMap, +// hash_config: HashConfig, +// writers: &mut Vec>, +// chunk_size: usize, +// threads: u32, +// ) { +// let reader = Reader::from_path(fna_file).unwrap(); +// let queue_len = (threads - 2) as usize; +// let value_bits = hash_config.value_bits; +// let cell_size = std::mem::size_of::>(); + +// read_parallel( +// reader, +// threads, +// queue_len, +// |record_set| { +// let mut k2_cell_list = Vec::new(); + +// for record in record_set.into_iter() { +// if let Ok(seq_id) = record.id() { +// if let Some(ext_taxid) = id_to_taxon_map.get(seq_id) { +// let taxid = taxonomy.get_internal_id(*ext_taxid); +// for hash_key in MinimizerScanner::new(record.seq(), meros).into_iter() { +// let index: usize = hash_config.index(hash_key); +// let idx = index % chunk_size; +// let partition_index = index / chunk_size; +// let cell = Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); +// k2_cell_list.push((partition_index, cell)); +// } +// }; +// } +// } +// k2_cell_list +// }, +// |record_sets| { +// while let Some(Ok((_, k2_cell_map))) = record_sets.next() { +// for cell in k2_cell_map { +// let partition_index = cell.0; +// if let Some(writer) = writers.get_mut(partition_index) { +// writer.write_all(&cell.1.as_slice(cell_size)).unwrap(); +// } +// } +// } +// }, +// ); +// } + /// 将fna文件转换成k2格式的临时文件 pub fn convert_fna_to_k2_format>( fna_file: P, @@ -199,38 +249,59 @@ pub fn convert_fna_to_k2_format>( hash_config: HashConfig, writers: &mut Vec>, chunk_size: usize, - threads: u32, + threads: usize, ) { - let reader = Reader::from_path(fna_file).unwrap(); - let queue_len = (threads - 2) as usize; + let mut reader = FastaReader::from_path(fna_file, 1).unwrap(); let value_bits = hash_config.value_bits; let cell_size = std::mem::size_of::>(); 
read_parallel( - reader, + &mut reader, threads, - queue_len, - |record_set| { + &meros, + |seqs| { let mut k2_cell_list = Vec::new(); - for record in record_set.into_iter() { - if let Ok(seq_id) = record.id() { - if let Some(ext_taxid) = id_to_taxon_map.get(seq_id) { + for record in seqs { + let header = &record.header; + record.body.apply_mut(|m_iter| { + if let Some(ext_taxid) = id_to_taxon_map.get(&header.id) { let taxid = taxonomy.get_internal_id(*ext_taxid); - for hash_key in MinimizerScanner::new(record.seq(), meros).into_iter() { - let index: usize = hash_config.index(hash_key); - let idx = index % chunk_size; - let partition_index = index / chunk_size; - let cell = Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); - k2_cell_list.push((partition_index, cell)); - } - }; - } + let k2_cell: Vec<(usize, Slot)> = m_iter + .map(|(_, hash_key)| { + let index: usize = hash_config.index(hash_key); + let idx = index % chunk_size; + let partition_index = index / chunk_size; + let cell = + Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); + (partition_index, cell) + }) + .collect(); + + k2_cell_list.extend_from_slice(&k2_cell); + } + }); + + // if let Some(ext_taxid) = id_to_taxon_map.get(&record.id) { + // let taxid = taxonomy.get_internal_id(*ext_taxid); + // record + // .marker + // .fold(&mut k2_cell_list, |k2_cell_list, marker| { + // for &hash_key in marker.minimizer.iter() { + // let index: usize = hash_config.index(hash_key); + // let idx = index % chunk_size; + // let partition_index = index / chunk_size; + // let cell = + // Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); + // k2_cell_list.push((partition_index, cell)); + // } + // }); + // } } - k2_cell_list + Some(k2_cell_list) }, |record_sets| { - while let Some(Ok((_, k2_cell_map))) = record_sets.next() { + while let Some(Some(k2_cell_map)) = record_sets.next() { for cell in k2_cell_map { let partition_index = cell.0; if let Some(writer) = 
writers.get_mut(partition_index) { @@ -239,5 +310,6 @@ pub fn convert_fna_to_k2_format>( } } }, - ); + ) + .expect("failed"); } diff --git a/kr2r/src/kr2r_data.rs b/kr2r/src/kr2r_data.rs index b993634..06b4770 100644 --- a/kr2r/src/kr2r_data.rs +++ b/kr2r/src/kr2r_data.rs @@ -1,9 +1,9 @@ +use crate::compact_hash::Row; use crate::utils::open_file; // use crate::{Meros, CURRENT_REVCOM_VERSION}; -use crate::{ - BITS_PER_CHAR, CURRENT_REVCOM_VERSION, DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, - DEFAULT_SPACED_SEED_MASK, DEFAULT_TOGGLE_MASK, -}; +use seqkmer::Meros; +use seqkmer::OptionPair; +use seqkmer::CURRENT_REVCOM_VERSION; use std::fs::File; use std::io::{Read, Result as IoResult, Write}; use std::mem; @@ -27,68 +27,31 @@ pub fn construct_seed_template(minimizer_len: usize, minimizer_spaces: usize) -> format!("{}{}", core, spaces) } -/// minimizer config -#[derive(Copy, Debug, Clone)] -pub struct Meros { - pub k_mer: usize, - pub l_mer: usize, - pub mask: u64, - pub spaced_seed_mask: u64, - pub toggle_mask: u64, - pub min_clear_hash_value: Option, +/// 判断u64的值是否为0,并将其转换为Option类型 +pub fn u64_to_option(value: u64) -> Option { + Option::from(value).filter(|&x| x != 0) } -impl Meros { - pub fn new( - k_mer: usize, - l_mer: usize, - spaced_seed_mask: Option, - toggle_mask: Option, - min_clear_hash_value: Option, - ) -> Self { - let mut mask = 1u64; - mask <<= l_mer * BITS_PER_CHAR; - mask -= 1; +pub struct HitGroup { + pub rows: Vec, + /// example: (0..10], 左开右闭 + pub range: OptionPair<(usize, usize)>, +} - Self { - k_mer, - l_mer, - mask, - spaced_seed_mask: spaced_seed_mask.unwrap_or(DEFAULT_SPACED_SEED_MASK), - toggle_mask: toggle_mask.unwrap_or(DEFAULT_TOGGLE_MASK) & mask, - min_clear_hash_value, - } +impl HitGroup { + pub fn new(rows: Vec, range: OptionPair<(usize, usize)>) -> Self { + Self { rows, range } } - pub fn window_size(&self) -> usize { - self.k_mer - self.l_mer + pub fn capacity(&self) -> usize { + self.range.reduce(0, |acc, range| acc + range.1 - 
range.0) } -} - -impl Default for Meros { - fn default() -> Self { - let l_mer = DEFAULT_MINIMIZER_LENGTH as usize; - let k_mer = DEFAULT_KMER_LENGTH as usize; - let mut mask = 1u64; - mask <<= l_mer * BITS_PER_CHAR; - mask -= 1; - Self { - k_mer, - l_mer, - mask, - spaced_seed_mask: DEFAULT_SPACED_SEED_MASK, - toggle_mask: DEFAULT_TOGGLE_MASK & mask, - min_clear_hash_value: Some(0), - } + pub fn required_score(&self, confidence_threshold: f64) -> u64 { + (confidence_threshold * self.capacity() as f64).ceil() as u64 } } -/// 判断u64的值是否为0,并将其转换为Option类型 -pub fn u64_to_option(value: u64) -> Option { - Option::from(value).filter(|&x| x != 0) -} - /// 顺序不能错 #[repr(C)] #[derive(Debug)] diff --git a/kr2r/src/lib.rs b/kr2r/src/lib.rs index 6e32bfb..8f5dd6e 100644 --- a/kr2r/src/lib.rs +++ b/kr2r/src/lib.rs @@ -1,17 +1,10 @@ -pub mod kr2r_data; +mod kr2r_data; mod kv_store; -pub mod mmscanner; pub mod readcounts; pub mod report; -pub mod seq; pub mod taxonomy; pub mod utils; -mod feat; - -pub use feat::constants::*; -pub use feat::*; - pub mod db; pub use kr2r_data::*; pub use kv_store::*; diff --git a/kr2r/src/mmscanner.rs b/kr2r/src/mmscanner.rs deleted file mode 100644 index 539788a..0000000 --- a/kr2r/src/mmscanner.rs +++ /dev/null @@ -1,270 +0,0 @@ -// kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash -use crate::{ - canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, -}; -use std::collections::VecDeque; - -#[inline] -fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { - let mut canonical_lmer = canonical_representation(lmer, meros.l_mer); - if meros.spaced_seed_mask > 0 { - canonical_lmer &= meros.spaced_seed_mask; - } - canonical_lmer ^ meros.toggle_mask -} - -#[derive(Debug)] -struct MinimizerData { - pub pos: usize, - candidate_lmer: u64, -} - -impl MinimizerData { - fn new(candidate_lmer: u64, pos: usize) -> Self { - Self { - candidate_lmer, - pos, - } - } -} - -pub struct MinimizerWindow { - queue: VecDeque, - queue_pos: usize, - 
/// 窗口队列的大小 - capacity: usize, - /// 队列计数 - count: usize, -} - -impl MinimizerWindow { - fn new(capacity: usize) -> Self { - Self { - queue: VecDeque::with_capacity(capacity), - capacity, - count: 0, - queue_pos: 0, - } - } - - #[inline] - fn next(&mut self, candidate_lmer: u64) -> Option { - // 无需比较,直接返回 - if self.capacity == 1 { - return Some(candidate_lmer); - } - - let data = MinimizerData::new(candidate_lmer, self.count); - - // 移除队列中所有比当前元素大的元素的索引 - // 因为它们不可能是当前窗口的最小值 - while let Some(m_data) = self.queue.back() { - if m_data.candidate_lmer > candidate_lmer { - self.queue.pop_back(); - } else { - break; - } - } - let mut changed = false; - - if (self.queue.is_empty() && self.count >= self.capacity) || self.count == self.capacity { - changed = true - } - // 将当前元素的索引添加到队列 - self.queue.push_back(data); - - while !self.queue.is_empty() - && self.queue.front().map_or(false, |front| { - self.count >= self.capacity && front.pos < self.count - self.capacity - }) - { - self.queue.pop_front(); - changed = true; - } - - self.count += 1; - if changed { - self.queue.front().map(|front| front.candidate_lmer) - } else { - None - } - } - - fn clear(&mut self) { - self.count = 0; - self.queue_pos = 0; - self.queue.clear(); - } -} - -struct Cursor { - pos: usize, - end: usize, - inner: Vec, - capacity: usize, - value: u64, - mask: u64, - window: MinimizerWindow, -} - -impl Cursor { - fn new(meros: &Meros, size: usize) -> Self { - Self { - pos: 0, - end: size, - inner: Vec::with_capacity(meros.l_mer), - capacity: meros.l_mer, - value: 0, - mask: meros.mask, - window: MinimizerWindow::new(meros.window_size()), - } - } - - /// 每次取一个 lmer 值出来,如果为空,表示一直 seq 已处理完成 - /// 遇到换行符,就跳过. 
- #[inline] - fn slide(&mut self, seq: &[u8]) -> Option { - while self.pos < self.end { - let ch = seq[self.pos]; - let code = if ch == b'\n' || ch == b'\r' { - self.pos += 1; - char_to_value(seq[self.pos]) - } else { - char_to_value(ch) - }; - self.pos += 1; - if let Some(c) = code { - if let Some(lmer) = self.next_lmer(c) { - return Some(lmer); - } - } else { - self.clear(); - } - } - None - } - - fn next_lmer(&mut self, item: u64) -> Option { - self.value <<= BITS_PER_CHAR; - self.value |= item; - if self.inner.len() == self.capacity { - self.inner.remove(0); // 移除最旧的元素 - } - self.inner.push(item); // 使用 push 方法 - if self.inner.len() >= self.capacity { - self.value &= self.mask; - return Some(self.value); - } - - None - } - - #[inline] - fn next_candidate_lmer(&mut self, item: u64) -> Option { - self.window.next(item) - } - - pub fn has_next(&self) -> bool { - self.pos < self.end - } - - // 清除元素 - #[inline] - fn clear(&mut self) { - self.inner.clear(); - self.value = 0; - self.window.clear(); - } -} - -pub struct MinimizerScanner<'a> { - seq: &'a [u8], - meros: Meros, - // l_mer: usize, - cursor: Cursor, - // 存最近一个最小值 - // last_minimizer: u64, -} - -impl<'a> MinimizerScanner<'a> { - pub fn new(seq: &'a [u8], meros: Meros) -> Self { - let size: usize = seq.len(); - MinimizerScanner { - seq, - meros, - cursor: Cursor::new(&meros, size), - // last_minimizer: std::u64::MAX, - } - } - - /// 在一个序列上滑动一个光标(可能是为了找到下一个有意义的片段或窗口), - /// 并对滑动得到的片段进行某种转换或处理。如果光标无法继续滑动(例如到达序列的末尾),则返回 None。 - fn next_window(&mut self) -> Option { - self.cursor.slide(&self.seq).and_then(|lmer| { - let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); - self.cursor.next_candidate_lmer(candidate_lmer) - }) - } -} - -impl<'a> Default for MinimizerScanner<'a> { - fn default() -> Self { - let meros = Meros::default(); - let seq: &[u8] = &[]; - MinimizerScanner::new(seq, meros) - } -} - -impl<'a> Iterator for MinimizerScanner<'a> { - type Item = u64; - - fn next(&mut self) -> Option { - 
while self.cursor.has_next() { - if let Some(minimizer) = self.next_window() { - return Some(murmur_hash3(minimizer ^ self.meros.toggle_mask)); - // if minimizer != self.last_minimizer { - // self.last_minimizer = minimizer; - // return Some(murmur_hash3(minimizer ^ self.meros.toggle_mask)); - // } - } - } - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_minimizer() { - // 1, 2, 3, 4 - let seq: Vec = vec![1, 2, 3, 4]; - // 窗口大小 = 2 - 0 + 1 - let mut mini: MinimizerWindow = MinimizerWindow::new(1); - let mut result = vec![]; - for s in seq { - if let Some(a) = mini.next(s) { - result.push(a); - } - } - // if let Some(a) = mini.get_last_minimizer() { - // result.push(a); - // } - assert_eq!(result, [1, 2, 3, 4]); - - let seq: Vec = vec![4, 3, 5, 2, 6, 2, 1]; - // 窗口大小 = 2 - 0 + 1 - let mut mini = MinimizerWindow::new(2); - let mut result = vec![]; - for s in seq { - if let Some(a) = mini.next(s) { - result.push(a); - } - } - // if let Some(a) = mini.get_last_minimizer() { - // result.push(a); - // } - assert_eq!(result, [3, 2, 2, 2, 1]); - } -} diff --git a/kr2r/src/seq.rs b/kr2r/src/seq.rs deleted file mode 100644 index db24286..0000000 --- a/kr2r/src/seq.rs +++ /dev/null @@ -1,270 +0,0 @@ -use crate::mmscanner::MinimizerScanner; -use crate::utils::open_file; -use seq_io::fasta; -use seq_io::fasta::Record as FaRecord; -use seq_io::fastq; -use seq_io::fastq::Record as FqRecord; - -use seq_io::parallel::Reader; - -use crate::utils::is_gzipped; -use crate::Meros; -use seq_io::policy::StdPolicy; -use std::collections::HashSet; -use std::io; -use std::iter; -use std::path::Path; - -type DefaultBufPolicy = StdPolicy; -use flate2::read::GzDecoder; - -pub trait SeqX { - fn seq_x(&self, score: i32) -> Vec; -} - -impl<'a> SeqX for fastq::RefRecord<'a> { - fn seq_x(&self, score: i32) -> Vec { - if score <= 0 { - return self.seq().to_vec(); - } - - let qual = self.qual(); - self.seq() - .iter() - .zip(qual.iter()) - .map(|(&base, &qscore)| { - if 
(qscore as i32 - '!' as i32) < score { - b'x' - } else { - base - } - }) - .collect::>() - } -} - -impl SeqX for fastq::OwnedRecord { - fn seq_x(&self, score: i32) -> Vec { - if score <= 0 { - return self.seq().to_vec(); - } - let qual = self.qual(); - self.seq() - .iter() - .zip(qual.iter()) - .map(|(&base, &qscore)| { - if (qscore as i32 - '!' as i32) < score { - b'x' - } else { - base - } - }) - .collect::>() - } -} - -impl<'a> SeqX for fasta::RefRecord<'a> { - #[allow(unused_variables)] - fn seq_x(&self, score: i32) -> Vec { - self.seq().to_vec() - } -} - -#[derive(Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct SeqReads { - pub dna_id: String, - pub seq_paired: Vec>, -} - -pub trait SeqSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet; -} - -pub fn open_fasta_reader>( - path1: P, -) -> io::Result>> { - let mut file1 = open_file(&path1)?; - - let read1: Box = if is_gzipped(&mut file1)? { - Box::new(GzDecoder::new(file1)) - } else { - Box::new(file1) - }; - - let reader1 = fasta::Reader::new(read1); - Ok(reader1) -} - -pub struct PairFastqReader

{ - reader1: fastq::Reader, P>, - reader2: Option, P>>, -} - -impl<'a> PairFastqReader { - /// Creates a reader from a file path. - #[inline] - pub fn from_path>(path1: P, path2: Option

) -> io::Result { - // 分别打开两个文件 - let mut file1 = open_file(&path1)?; - - let read1: Box = if is_gzipped(&mut file1)? { - Box::new(GzDecoder::new(file1)) - } else { - Box::new(file1) - }; - - let reader1 = fastq::Reader::new(read1); - - let reader2 = match path2 { - Some(path2) => { - let mut file2 = open_file(path2)?; - let read2: Box = if is_gzipped(&mut file2)? { - Box::new(GzDecoder::new(file2)) - } else { - Box::new(file2) - }; - Some(fastq::Reader::new(read2)) - } - None => None, - }; - - // 使用这两个实例构造一个 PairFastqReader 对象 - Ok(PairFastqReader { reader1, reader2 }) - } - - pub fn next(&mut self) -> Option { - let ref_record1 = self - .reader1 - .next()? - .expect("fastq file error") - .to_owned_record(); - let ref_record2 = match &mut self.reader2 { - Some(reader2) => Some(reader2.next()?.expect("fastq file error").to_owned_record()), - None => None, - }; - // let ref_recrod2 = self - // .reader2 - // .next()? - // .expect("fastq file error") - // .to_owned_record(); - - Some(PairFastqRecord(ref_record1, ref_record2)) - } -} - -pub struct PairFastqRecord(pub fastq::OwnedRecord, pub Option); - -pub struct PairFastqRecordSet(fastq::RecordSet, fastq::RecordSet); - -impl<'a> iter::IntoIterator for &'a PairFastqRecordSet { - type Item = (fastq::RefRecord<'a>, Option>); - type IntoIter = PairFastqRecordSetIter<'a>; - - #[inline] - fn into_iter(self) -> Self::IntoIter { - PairFastqRecordSetIter(self.0.into_iter(), self.1.into_iter()) - } -} - -pub struct PairFastqRecordSetIter<'a>(fastq::RecordSetIter<'a>, fastq::RecordSetIter<'a>); - -impl Default for PairFastqRecordSet { - fn default() -> Self { - PairFastqRecordSet(fastq::RecordSet::default(), fastq::RecordSet::default()) - } -} - -impl<'a> Iterator for PairFastqRecordSetIter<'a> { - type Item = (fastq::RefRecord<'a>, Option>); - - #[inline] - fn next(&mut self) -> Option { - match (self.0.next(), self.1.next()) { - (Some(record1), Some(record2)) => Some((record1, Some(record2))), - (Some(record1), None) => 
Some((record1, None)), - _ => None, // Return None if either iterator runs out of records - } - } -} - -impl

Reader for PairFastqReader

-where - P: seq_io::policy::BufPolicy + Send, -{ - type DataSet = PairFastqRecordSet; - type Err = fastq::Error; - - #[inline] - fn fill_data(&mut self, rset: &mut PairFastqRecordSet) -> Option> { - match self.reader1.read_record_set(&mut rset.0)? { - Ok(_) => { - if let Some(ref mut reader) = &mut self.reader2 { - match reader.read_record_set(&mut rset.1)? { - Ok(_) => Some(Ok(())), - Err(_) => None, - } - } else { - Some(Ok(())) - } - } - Err(e) => { - println!("{:?}", e); - None - } - } - - // If both reads are successful, return Ok(()) - // Some(Ok(())) - } -} - -impl SeqSet for PairFastqRecordSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet { - let mut seq_pair_set = HashSet::::new(); - - for records in self.into_iter() { - let dna_id = records.0.id().unwrap_or_default().to_string(); - let seq1 = records.0.seq_x(score); - if let Some(record3) = records.1 { - let seq2 = record3.seq_x(score); - let kmers1 = MinimizerScanner::new(&seq1, meros).collect(); - let kmers2 = MinimizerScanner::new(&seq2, meros).collect(); - let seq_paired: Vec> = vec![kmers1, kmers2]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } else { - let kmers1 = MinimizerScanner::new(&seq1, meros).collect(); - let seq_paired: Vec> = vec![kmers1]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } - } - seq_pair_set - } -} - -impl SeqSet for fastq::RecordSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet { - let mut seq_pair_set = HashSet::::new(); - for records in self.into_iter() { - let dna_id = records.id().unwrap_or_default().to_string(); - let seq1 = records.seq_x(score); - let kmers1: Vec = MinimizerScanner::new(&seq1, meros).collect(); - let seq_paired: Vec> = vec![kmers1]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } - seq_pair_set - } -} - -impl SeqSet for fasta::RecordSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet { - let mut seq_pair_set = HashSet::::new(); - for records in self.into_iter() { 
- let dna_id = records.id().unwrap_or_default().to_string(); - let seq1 = records.seq_x(score); - let kmers1: Vec = MinimizerScanner::new(&seq1, meros).collect(); - let seq_paired: Vec> = vec![kmers1]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } - seq_pair_set - } -} diff --git a/kr2r/src/utils.rs b/kr2r/src/utils.rs index f076344..b7fa650 100644 --- a/kr2r/src/utils.rs +++ b/kr2r/src/utils.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::fs::{self, create_dir_all, File, OpenOptions}; -use std::io::{BufRead, BufReader, BufWriter, Result, Seek, Write}; +use std::io::{self, BufRead, BufReader, BufWriter, Result, Write}; use std::path::{Path, PathBuf}; use walkdir::WalkDir; @@ -69,15 +69,25 @@ pub fn expand_spaced_seed_mask(spaced_seed_mask: u64, bit_expansion_factor: u64) new_mask } -pub fn find_library_fna_files>(path: P) -> Vec { +pub fn find_files>(path: P, prefix: &str, suffix: &str) -> Vec { WalkDir::new(path) .into_iter() .filter_map(|e| e.ok()) - .filter(|e| e.path().file_name() == Some("library.fna".as_ref())) - .map(|e| e.path().to_string_lossy().into_owned()) + .filter(|e| { + e.path() + .file_name() + .and_then(|name| name.to_str()) + .map(|name| name.starts_with(prefix) && name.ends_with(suffix)) + .unwrap_or(false) + }) + .map(|e| e.path().to_path_buf()) .collect() } +pub fn find_library_fna_files>(path: P) -> Vec { + find_files(path, "library", ".fna") +} + pub fn summary_prelim_map_files>(data_dir: P) -> Result { let lib_path = data_dir.as_ref().join("library"); @@ -142,77 +152,6 @@ pub fn format_bytes(size: f64) -> String { format!("{:.2}{}", size, current_suffix) } -#[derive(Debug)] -pub enum FileFormat { - Fasta, - Fastq, -} - -use flate2::read::GzDecoder; -use std::io::{self, Read}; - -pub fn is_gzipped(file: &mut File) -> io::Result { - let mut buffer = [0; 2]; - file.read_exact(&mut buffer)?; - file.rewind()?; // 重置文件指针到开头 - Ok(buffer == [0x1F, 0x8B]) -} - -pub fn detect_file_format>(path: P) -> io::Result { - let mut 
file = open_file(path)?; - let read1: Box = if is_gzipped(&mut file)? { - Box::new(GzDecoder::new(file)) - } else { - Box::new(file) - }; - - let reader = BufReader::new(read1); - let mut lines = reader.lines(); - - if let Some(first_line) = lines.next() { - let line = first_line?; - - if line.starts_with('>') { - return Ok(FileFormat::Fasta); - } else if line.starts_with('@') { - let _ = lines.next(); - if let Some(third_line) = lines.next() { - let line: String = third_line?; - if line.starts_with('+') { - return Ok(FileFormat::Fastq); - } - } - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - "Unrecognized fasta(fastq) file format", - )); - } - } - - Err(io::Error::new( - io::ErrorKind::Other, - "Unrecognized fasta(fastq) file format", - )) - // let mut buffer = [0; 1]; // 仅分配一个字节的缓冲区 - - // // 读取文件的第一个字节 - // let bytes_read = reader.read(&mut buffer)?; - - // if bytes_read == 0 { - // return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "Empty file")); - // } - - // match buffer[0] { - // b'>' => Ok(FileFormat::Fasta), - // b'@' => Ok(FileFormat::Fastq), - // _ => Err(io::Error::new( - // io::ErrorKind::Other, - // "Unrecognized file format", - // )), - // } -} - #[cfg(unix)] extern crate libc; @@ -324,7 +263,7 @@ pub fn find_and_sort_files( if a_idx as i32 != *num { return Err(io::Error::new( io::ErrorKind::NotFound, - "File numbers are not continuous starting from 0.", + "File numbers are not continuous starting from 1.", )); } } diff --git a/ncbi/Cargo.toml b/ncbi/Cargo.toml index 1c67f49..f7532e5 100644 --- a/ncbi/Cargo.toml +++ b/ncbi/Cargo.toml @@ -22,3 +22,4 @@ env_logger = "0.11.0" md-5 = "0.10.6" async-compression = "0.4.5" tar = "0.4" +num_cpus = "1.13.1" diff --git a/ncbi/examples/run_download.rs b/ncbi/examples/run_download.rs new file mode 100644 index 0000000..cadaf6f --- /dev/null +++ b/ncbi/examples/run_download.rs @@ -0,0 +1,56 @@ +use std::fs; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + let 
workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .to_path_buf(); + + // Run the NCBI binary to download files + let ncbi_binary = workspace_root.join("target/release/ncbi"); + let download_dir = workspace_root.join("downloads"); + // Ensure the download directory exists + fs::create_dir_all(&download_dir).expect("Failed to create download directory"); + + let args = vec![ + "-d".to_string(), + download_dir.to_string_lossy().to_string(), + "gen".to_string(), + "-g".to_string(), + "archaea".to_string(), + ]; + + let command_str = format!("{} {}", ncbi_binary.to_string_lossy(), args.join(" ")); + println!("Executing command: {}", command_str); + + // Run the NCBI binary to download files + let output = Command::new(&ncbi_binary) + .args(&args) + .output() + .expect("Failed to run NCBI binary"); + println!( + "NCBI binary output: {}", + String::from_utf8_lossy(&output.stdout) + ); + + let args = vec![ + "-d".to_string(), + download_dir.to_string_lossy().to_string(), + "tax".to_string(), + ]; + + let command_str = format!("{} {}", ncbi_binary.to_string_lossy(), args.join(" ")); + println!("Executing command: {}", command_str); + + // Run the NCBI binary to download files + let output = Command::new(&ncbi_binary) + .args(&args) + .output() + .expect("Failed to run NCBI binary"); + println!( + "NCBI binary output: {}", + String::from_utf8_lossy(&output.stdout) + ); +} diff --git a/ncbi/src/fna.rs b/ncbi/src/fna.rs index b78bb19..1d9c5f4 100644 --- a/ncbi/src/fna.rs +++ b/ncbi/src/fna.rs @@ -15,6 +15,7 @@ use tar::Archive; pub async fn decompress_and_extract_tar_gz( gz_path: &PathBuf, out_path: &PathBuf, + files_to_extract: Vec, ) -> std::io::Result<()> { // Open the .tar.gz file let file = File::open(gz_path).await?; @@ -32,7 +33,26 @@ pub async fn decompress_and_extract_tar_gz( // Use the tar crate to decompress the TAR archive let mut archive = Archive::new(&decompressed_data[..]); - archive.unpack(out_path)?; + // 
archive.unpack(out_path)?; + + // 遍历 TAR 归档中的每个条目 + for entry in archive.entries()? { + let mut entry = entry?; + let path = entry.path()?.to_string_lossy().to_string(); + + // 检查是否为需要提取的文件 + if files_to_extract.contains(&path) { + let out_file_path = out_path.join(&path); + + // 创建输出文件夹 + if let Some(parent) = out_file_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + // 解压缩并写入文件 + entry.unpack(out_file_path)?; + } + } Ok(()) } diff --git a/ncbi/src/load.rs b/ncbi/src/load.rs index 9d65ca4..4656cca 100644 --- a/ncbi/src/load.rs +++ b/ncbi/src/load.rs @@ -101,7 +101,9 @@ impl NcbiFile { NcbiFile::Summary(_) => {} NcbiFile::Genomic(_, _) => {} NcbiFile::Taxonomy(dt1, _) => { - let _ = decompress_and_extract_tar_gz(&dt1.file, &data_dir).await; + let taxo_files: Vec = + vec!["names.dmp".to_string(), "nodes.dmp".to_string()]; + decompress_and_extract_tar_gz(&dt1.file, &data_dir, taxo_files).await?; } } Ok(()) diff --git a/ncbi/src/main.rs b/ncbi/src/main.rs index e1b15b5..fe7fd99 100644 --- a/ncbi/src/main.rs +++ b/ncbi/src/main.rs @@ -52,11 +52,11 @@ fn validate_group(group: &str) -> Result { #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] enum Site { - /// 下载 genbank 资源 + /// Download genbank resources Genbank, - /// 下载 refseq 资源 + /// Download refseq resources Refseq, - /// genbank and refseq + /// Both genbank and refseq All, } @@ -76,18 +76,18 @@ impl fmt::Display for Site { #[derive(Subcommand, Debug)] enum Mode { - /// 仅检查文件的 md5 + /// Check the md5 of files only Md5, - /// 解析 genomic 文件,并且生成 library fna 文件 - /// 同时将单个fna文件拼接成group为组的总的fna格式文件, 以便于构建database + /// Parse genomic files and generate a library fna file + /// Also concatenate individual fna files into a group for building a database Fna { - /// library fna 文件存储目录,为了不和原始文件混淆 + /// Directory to store the library fna file to avoid mixing with original files #[clap(value_parser)] out_dir: Option, }, - /// 仅下载和解析 assembly 文件 + /// Download and parse assembly 
files only Assembly, - /// 单独下载 genomic 文件,指定 url 地址 + /// Download genomic files separately by specifying a URL Url { #[clap(value_parser)] url: String, @@ -98,15 +98,15 @@ enum Mode { #[clap( version, about = "ncbi download resource", - long_about = "从 ncbi 网站上下载 genomes 资源" + long_about = "Download genomes resources from the NCBI website" )] struct Args { - /// 构建数据库的目录 - #[arg(short, long = "db", default_value = "lib")] - database: PathBuf, + /// Directory to store downloaded files + #[arg(short, long, default_value = "lib")] + download_dir: PathBuf, - /// 下载时的并行大小 - #[arg(short, long, default_value = "8")] + /// Number of threads to use for downloading + #[arg(short, long, default_value_t = num_cpus::get() * 2)] num_threads: usize, #[command(subcommand)] @@ -115,14 +115,14 @@ struct Args { #[derive(Subcommand, Debug)] enum Commands { - /// 从 NCBI 下载 taxonomy 文件 (alias: tax) + /// Download taxonomy files from NCBI (alias: tax) #[command(alias = "tax")] Taxonomy, - /// 从 NCBI 下载 genomes 数据 (alias: gen) + /// Download genomes data from NCBI (alias: gen) #[command(alias = "gen")] Genomes { - /// 从 NCBI 哪个站点目录下载(RefSeq或GenBank) + /// Site directory to download from NCBI (RefSeq or GenBank) #[arg(long, value_enum, default_value_t = Site::Refseq)] site: Site, @@ -131,18 +131,19 @@ enum Commands { #[arg(long, default_value = "basic")] asm_level: String, - /// 从 NCBI 站点上下载某个种类的数据信息,可以是逗号分隔的多个, archaea,bacteria,viral,fungi,plant,human,protozoa,vertebrate_mammalian,vertebrate_other,invertebrate + /// Type of data to download from NCBI site, can be multiple comma-separated values + /// e.g., archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other, invertebrate #[arg(short, long, value_parser = validate_group)] group: String, - /// 子命令,使用 md5 校验和生成 fna 文件 + /// Subcommand to generate fna files using md5 checksum #[command(subcommand)] mode: Option, }, } async fn async_run(args: Args) -> Result<()> { - let db_path = 
utils::create_data_dir(&args.database).unwrap(); + let db_path = utils::create_data_dir(&args.download_dir).unwrap(); init_meta(&db_path).await; match args.command { @@ -245,12 +246,12 @@ async fn async_run(args: Args) -> Result<()> { }, Some(Mode::Url { url }) => { if site == Site::All { - log::error!("必须指定合适的site"); + log::error!("Must specify a suitable site"); } else { let result = task::run_download_file(&site.to_string(), &data_dir, &url).await; if result.is_err() { - log::error!("下载文件失败... {:?}", result); + log::error!("download error... {:?}", result); } } } diff --git a/ncbi/src/task.rs b/ncbi/src/task.rs index 10cf947..97538b2 100644 --- a/ncbi/src/task.rs +++ b/ncbi/src/task.rs @@ -157,14 +157,14 @@ pub async fn run_taxo(taxo_dir: &PathBuf) -> Result<()> { log::info!("download taxonomy..."); let files = [ "taxdump.tar.gz", - "accession2taxid/nucl_gb.accession2taxid.gz", - "accession2taxid/nucl_wgs.accession2taxid.gz", + // "accession2taxid/nucl_gb.accession2taxid.gz", + // "accession2taxid/nucl_wgs.accession2taxid.gz", ]; for url_path in files.iter() { let ncbi_file = NcbiFile::new_taxo(taxo_dir, &url_path).await; let result = ncbi_file.run().await; if result.is_ok() && url_path.to_string() == "taxdump.tar.gz" { - let _ = ncbi_file.decompress(taxo_dir).await; + ncbi_file.decompress(taxo_dir).await?; } } log::info!("download taxonomy finished..."); diff --git a/seqkmer/Cargo.toml b/seqkmer/Cargo.toml new file mode 100644 index 0000000..76a0d27 --- /dev/null +++ b/seqkmer/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "seqkmer" +version = "0.1.0" +edition = "2021" +authors = ["eric9n@gmail.com"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +crossbeam-channel = "0.5" +scoped_threadpool = "0.1.9" +flate2 = "1.0" + +[features] +default = ["dna"] +dna = [] +protein = [] diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs new file mode 100644 index 0000000..b530134 --- /dev/null +++ 
b/seqkmer/src/fasta.rs @@ -0,0 +1,151 @@ +use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; +use crate::seq::{Base, SeqFormat, SeqHeader}; +use crate::utils::OptionPair; +use std::io::{BufRead, BufReader, Read, Result}; +use std::path::Path; + +const SEQ_LIMIT: u64 = u64::pow(2, 32); +/// FastaReader +pub struct FastaReader +where + R: Read + Send, +{ + reader: BufReader, + file_index: usize, + reads_index: usize, + header: Vec, + seq: Vec, + + // 批量读取 + batch_size: usize, +} + +impl FastaReader +where + R: Read + Send, +{ + pub fn new(reader: R, file_index: usize) -> Self { + Self::with_capacity(reader, file_index, BUFSIZE, 30) + } + + pub fn with_capacity(reader: R, file_index: usize, capacity: usize, batch_size: usize) -> Self { + assert!(capacity >= 3); + Self { + reader: BufReader::with_capacity(capacity, reader), + file_index, + reads_index: 0, + header: Vec::new(), + seq: Vec::new(), + batch_size, + } + } + + pub fn read_next_entry<'a>(&'a mut self) -> Result, &'a Vec)>> { + // 清空header和seq缓冲区 + self.header.clear(); + self.seq.clear(); + + // 读取header部分 + if self.reader.read_until(b'\n', &mut self.header)? == 0 { + return Ok(None); + } + trim_end(&mut self.header); + + // 读取seq部分 + if self.reader.read_until(b'>', &mut self.seq)? == 0 { + return Ok(None); + } + trim_end(&mut self.seq); + + // 返回header和seq的引用 + Ok(Some((&self.header, &self.seq))) + } + + pub fn read_next(&mut self) -> Result> { + // 读取fastq文件header部分 + self.header.clear(); + if self.reader.read_until(b'\n', &mut self.header)? == 0 { + return Ok(None); + } + // 读取fasta文件seq部分 + self.seq.clear(); + if self.reader.read_until(b'>', &mut self.seq)? 
== 0 { + return Ok(None); + } + trim_end(&mut self.seq); + Ok(Some(())) + } + + pub fn _next(&mut self) -> Result>)>> { + if self.read_next()?.is_none() { + return Ok(None); + } + + let seq_len = self.seq.len(); + // 检查seq的长度是否大于2的32次方 + if seq_len as u64 > SEQ_LIMIT { + eprintln!("Sequence length exceeds 2^32, which is not handled."); + return Ok(None); + } + + let seq_id = unsafe { + let slice = if self.header.starts_with(b">") { + &self.header[1..] + } else { + &self.header[..] + }; + + let s = std::str::from_utf8_unchecked(slice); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + self.reads_index += 1; + + let seq_header = SeqHeader { + file_index: self.file_index, + reads_index: self.reads_index, + format: SeqFormat::Fasta, + id: seq_id.to_owned(), + }; + Ok(Some(( + seq_len, + Base::new(seq_header, OptionPair::Single(self.seq.to_owned())), + ))) + } +} + +impl FastaReader> { + #[inline] + pub fn from_path>(path: P, file_index: usize) -> Result { + let reader = dyn_reader(path)?; + Ok(Self::new(reader, file_index)) + } +} + +impl Reader for FastaReader { + fn next(&mut self) -> Result>>>> { + let mut seqs = Vec::new(); + let mut total_bytes = 0; + let max_bytes = 10 * 1024 * 1024; + + for _ in 0..self.batch_size { + if let Some((seq_len, seq)) = self._next()? 
{ + seqs.push(seq); + total_bytes += seq_len; + if total_bytes > max_bytes { + break; + } + } else { + break; + } + } + + Ok(if seqs.is_empty() { None } else { Some(seqs) }) + } +} diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs new file mode 100644 index 0000000..218b863 --- /dev/null +++ b/seqkmer/src/fastq.rs @@ -0,0 +1,192 @@ +use crate::reader::{dyn_reader, trim_end, trim_pair_info, Reader, BUFSIZE}; +use crate::seq::{Base, SeqFormat, SeqHeader}; +use crate::utils::OptionPair; +use std::io::{BufRead, BufReader, Read, Result}; +use std::path::Path; + +struct QReader { + reader: BufReader, + quality_score: i32, + + header: Vec, + seq: Vec, + plus: Vec, + quals: Vec, +} + +impl QReader +where + R: Read + Send, +{ + pub fn with_capacity(reader: R, capacity: usize, quality_score: i32) -> Self { + assert!(capacity >= 3); + Self { + reader: BufReader::with_capacity(capacity, reader), + header: Vec::new(), + seq: Vec::new(), + plus: Vec::new(), + quals: Vec::new(), + quality_score, + } + } + + pub fn read_next(&mut self) -> Result> { + // 读取fastq文件header部分 + self.header.clear(); + if self.reader.read_until(b'\n', &mut self.header)? == 0 { + return Ok(None); + } + // 读取fastq文件seq部分 + self.seq.clear(); + if self.reader.read_until(b'\n', &mut self.seq)? == 0 { + return Ok(None); + } + trim_end(&mut self.seq); + + // 读取fastq文件+部分 + self.plus.clear(); + if self.reader.read_until(b'\n', &mut self.plus)? == 0 { + return Ok(None); + } + + // 读取fastq文件quals部分 + self.quals.clear(); + if self.reader.read_until(b'\n', &mut self.quals)? == 0 { + return Ok(None); + } + trim_end(&mut self.quals); + + if self.quality_score > 0 { + for (base, &qscore) in self.seq.iter_mut().zip(self.quals.iter()) { + if (qscore as i32 - '!' 
as i32) < self.quality_score { + *base = b'x'; + } + } + } + + Ok(Some(())) + } +} + +pub struct FastqReader { + inner: OptionPair>, + file_index: usize, + reads_index: usize, + // 批量读取 + batch_size: usize, +} + +impl FastqReader +where + R: Read + Send, +{ + pub fn new(readers: OptionPair, file_index: usize, quality_score: i32) -> Self { + Self::with_capacity(readers, file_index, BUFSIZE, quality_score, 30) + } + + pub fn with_capacity<'a>( + readers: OptionPair, + file_index: usize, + capacity: usize, + quality_score: i32, + batch_size: usize, + ) -> Self { + assert!(capacity >= 3); + let inner = match readers { + OptionPair::Single(reader) => { + OptionPair::Single(QReader::with_capacity(reader, capacity, quality_score)) + } + OptionPair::Pair(reader1, reader2) => OptionPair::Pair( + QReader::with_capacity(reader1, capacity, quality_score), + QReader::with_capacity(reader2, capacity, quality_score), + ), + }; + Self { + inner, + file_index, + reads_index: 0, + batch_size, + } + } + + fn create_seq_header(reader: &QReader, file_index: usize, reads_index: usize) -> SeqHeader { + let seq_id = unsafe { + let s = std::str::from_utf8_unchecked(&reader.header[1..]); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + SeqHeader { + file_index, + reads_index, + format: SeqFormat::Fastq, + id: trim_pair_info(seq_id), + } + } + + pub fn read_next(&mut self) -> Result>>> { + match &mut self.inner { + OptionPair::Single(reader) => { + if reader.read_next()?.is_none() { + return Ok(None); + } + + self.reads_index += 1; + + let seq_header = + Self::create_seq_header(&reader, self.file_index, self.reads_index); + Ok(Some(Base::new( + seq_header, + OptionPair::Single(reader.seq.to_owned()), + ))) + } + OptionPair::Pair(reader1, reader2) => { + if reader1.read_next()?.is_none() { + return Ok(None); + } + if reader2.read_next()?.is_none() { + return Ok(None); + } + + 
self.reads_index += 1; + let seq_header = + Self::create_seq_header(&reader1, self.file_index, self.reads_index); + + Ok(Some(Base::new( + seq_header, + OptionPair::Pair(reader1.seq.to_owned(), reader2.seq.to_owned()), + ))) + } + } + } +} + +impl FastqReader> { + #[inline] + pub fn from_path>( + paths: OptionPair

, + file_index: usize, + quality_score: i32, + ) -> Result { + let readers = paths.map(|path| dyn_reader(path))?; + Ok(Self::new(readers, file_index, quality_score)) + } +} + +impl Reader for FastqReader +where + R: Read + Send, +{ + fn next(&mut self) -> Result>>>> { + let seqs: Vec>> = (0..self.batch_size) + .filter_map(|_| self.read_next().transpose()) + .collect::>>()?; + + Ok(Some(seqs).filter(|v| !v.is_empty())) + } +} diff --git a/seqkmer/src/fastx.rs b/seqkmer/src/fastx.rs new file mode 100644 index 0000000..3dec5b9 --- /dev/null +++ b/seqkmer/src/fastx.rs @@ -0,0 +1,45 @@ +use crate::fasta::FastaReader; +use crate::fastq::FastqReader; +use crate::reader::{detect_file_format, Reader}; +use crate::seq::{Base, SeqFormat}; +use crate::utils::OptionPair; +use std::io::Result; +use std::path::Path; + +pub struct FastxReader { + inner: R, +} + +impl FastxReader { + pub fn new(inner: R) -> Self { + Self { inner } + } +} + +impl Reader for FastxReader { + fn next(&mut self) -> Result>>>> { + self.inner.next() + } +} +impl FastxReader> { + pub fn from_paths>( + paths: OptionPair

, + file_index: usize, + quality_score: i32, + ) -> Result { + let file_format = paths.map(|path: &P| detect_file_format(path)); + + match file_format? { + OptionPair::Single(SeqFormat::Fasta) => { + let reader = FastaReader::from_path(paths.single().unwrap().as_ref(), file_index)?; + Ok(Self::new(Box::new(reader) as Box)) + } + OptionPair::Single(SeqFormat::Fastq) + | OptionPair::Pair(SeqFormat::Fastq, SeqFormat::Fastq) => { + let reader = FastqReader::from_path(paths, file_index, quality_score)?; + Ok(Self::new(Box::new(reader) as Box)) + } + _ => panic!("Unsupported file format combination"), + } + } +} diff --git a/kr2r/src/feat.rs b/seqkmer/src/feat.rs similarity index 65% rename from kr2r/src/feat.rs rename to seqkmer/src/feat.rs index 590bc81..87b753f 100644 --- a/kr2r/src/feat.rs +++ b/seqkmer/src/feat.rs @@ -119,3 +119,84 @@ pub fn canonical_representation(kmer: u64, n: usize, revcom_version: u8) -> u64 pub const DEFAULT_TOGGLE_MASK: u64 = 0xe37e28c4271b5a2d; pub const DEFAULT_SPACED_SEED_MASK: u64 = 0; pub const CURRENT_REVCOM_VERSION: u8 = 1; + +// 声明常量 +const M1: u64 = 0xff51afd7ed558ccd; +const M2: u64 = 0xc4ceb9fe1a85ec53; + +/// +/// # Examples +/// +/// ``` +/// # use kr2r::fmix64; +/// let key: u64 = 123; +/// let hash = fmix64(key); +/// assert_eq!(hash, 9208534749291869864); +/// ``` +#[inline] +pub fn fmix64(key: u64) -> u64 { + let mut k = key; + k ^= k >> 33; + k = k.wrapping_mul(M1); + k ^= k >> 33; + k = k.wrapping_mul(M2); + k ^= k >> 33; + k +} + +/// minimizer config +#[derive(Copy, Debug, Clone)] +pub struct Meros { + pub k_mer: usize, + pub l_mer: usize, + pub mask: u64, + pub spaced_seed_mask: u64, + pub toggle_mask: u64, + pub min_clear_hash_value: Option, +} + +impl Meros { + pub fn new( + k_mer: usize, + l_mer: usize, + spaced_seed_mask: Option, + toggle_mask: Option, + min_clear_hash_value: Option, + ) -> Self { + let mut mask = 1u64; + mask <<= l_mer * constants::BITS_PER_CHAR; + mask -= 1; + + Self { + k_mer, + l_mer, + mask, + 
spaced_seed_mask: spaced_seed_mask.unwrap_or(DEFAULT_SPACED_SEED_MASK), + toggle_mask: toggle_mask.unwrap_or(DEFAULT_TOGGLE_MASK) & mask, + min_clear_hash_value, + } + } + + pub fn window_size(&self) -> usize { + self.k_mer - self.l_mer + } +} + +impl Default for Meros { + fn default() -> Self { + let l_mer = constants::DEFAULT_MINIMIZER_LENGTH as usize; + let k_mer = constants::DEFAULT_KMER_LENGTH as usize; + let mut mask = 1u64; + mask <<= l_mer * constants::BITS_PER_CHAR; + mask -= 1; + + Self { + k_mer, + l_mer, + mask, + spaced_seed_mask: DEFAULT_SPACED_SEED_MASK, + toggle_mask: DEFAULT_TOGGLE_MASK & mask, + min_clear_hash_value: None, + } + } +} diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs new file mode 100644 index 0000000..3beae3e --- /dev/null +++ b/seqkmer/src/lib.rs @@ -0,0 +1,20 @@ +mod fasta; +mod fastq; +mod fastx; +mod feat; +mod mmscanner; +mod parallel; +mod reader; +mod seq; +mod utils; + +pub use fasta::*; +pub use fastq::*; +pub use fastx::*; +pub use feat::constants::*; +pub use feat::*; +pub use mmscanner::MinimizerIterator; +pub use parallel::*; +pub use reader::*; +pub use seq::*; +pub use utils::OptionPair; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs new file mode 100644 index 0000000..ce2fe70 --- /dev/null +++ b/seqkmer/src/mmscanner.rs @@ -0,0 +1,287 @@ +// kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash +use crate::seq::Base; +use crate::utils::OptionPair; +use crate::{ + canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, +}; +use std::collections::VecDeque; + +#[inline] +fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { + let mut canonical_lmer = canonical_representation(lmer, meros.l_mer); + if meros.spaced_seed_mask > 0 { + canonical_lmer &= meros.spaced_seed_mask; + } + canonical_lmer ^ meros.toggle_mask +} + +#[derive(Debug)] +pub struct MinimizerData { + pos: usize, + candidate_lmer: u64, +} + +impl MinimizerData { + fn new(candidate_lmer: u64, pos: usize) -> Self { 
+ Self { + candidate_lmer, + pos, + } + } +} + +pub struct MinimizerWindow { + queue: VecDeque, + queue_pos: usize, + /// 窗口队列的大小 + capacity: usize, + /// 队列计数 + count: usize, +} + +impl MinimizerWindow { + fn new(capacity: usize) -> Self { + Self { + queue: VecDeque::with_capacity(capacity), + capacity, + count: 0, + queue_pos: 0, + } + } + + #[inline] + fn next(&mut self, candidate_lmer: u64) -> Option { + // 无需比较,直接返回 + if self.capacity == 1 { + return Some(candidate_lmer); + } + + let data = MinimizerData::new(candidate_lmer, self.count); + + // 移除队列中所有比当前元素大的元素的索引 + // 因为它们不可能是当前窗口的最小值 + while let Some(m_data) = self.queue.back() { + if m_data.candidate_lmer > candidate_lmer { + self.queue.pop_back(); + } else { + break; + } + } + let mut changed = false; + + if (self.queue.is_empty() && self.count >= self.capacity) || self.count == self.capacity { + changed = true + } + // 将当前元素的索引添加到队列 + self.queue.push_back(data); + + while !self.queue.is_empty() + && self.queue.front().map_or(false, |front| { + self.count >= self.capacity && front.pos < self.count - self.capacity + }) + { + self.queue.pop_front(); + changed = true; + } + + self.count += 1; + if changed { + self.queue.front().map(|front| front.candidate_lmer) + } else { + None + } + } + + fn clear(&mut self) { + self.count = 0; + self.queue_pos = 0; + self.queue.clear(); + } +} + +#[derive(Clone, Copy)] +pub struct Cursor { + pos: usize, + capacity: usize, + value: u64, + mask: u64, +} + +impl Cursor { + fn new(capacity: usize, mask: u64) -> Self { + Self { + pos: 0, + value: 0, + capacity, + mask, + } + } + + fn next_lmer(&mut self, item: u64) -> Option { + self.value = ((self.value << BITS_PER_CHAR) | item) & self.mask; + // 更新当前位置 + self.pos += 1; + // 检查是否达到了容量 + if self.pos >= self.capacity { + return Some(self.value); + } + None + } + + // 清除元素 + #[inline] + fn clear(&mut self) { + self.pos = 0; + self.value = 0; + } +} + +pub struct MinimizerIterator<'a> { + cursor: Cursor, + window: MinimizerWindow, 
+ seq: &'a [u8], + meros: &'a Meros, + pos: usize, + end: usize, + pub size: usize, +} + +impl<'a> MinimizerIterator<'a> { + pub fn new(seq: &'a [u8], cursor: Cursor, window: MinimizerWindow, meros: &'a Meros) -> Self { + MinimizerIterator { + cursor, + window, + seq, + meros, + pos: 0, + size: 0, + end: seq.len(), + } + } + + fn clear_state(&mut self) { + self.cursor.clear(); + self.window.clear(); + } + + pub fn seq_size(&self) -> usize { + self.end + } +} + +impl<'a> Iterator for MinimizerIterator<'a> { + type Item = (usize, u64); + + fn next(&mut self) -> Option { + // self.sequence + // .iter() + // .filter_map(|&ch| { + // if ch == b'\n' || ch == b'\r' { + // None + // } else { + // match char_to_value(ch) { + // Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { + // let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); + // self.window + // .next(candidate_lmer) + // .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) + // }), + // None => { + // self.clear_state(); + // None + // } + // } + // } + // }) + // .next() + while self.pos < self.end { + let ch = self.seq[self.pos]; + self.pos += 1; + if ch == b'\n' || ch == b'\r' { + continue; + } else { + let data = match char_to_value(ch) { + Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { + let candidate_lmer = to_candidate_lmer(&self.meros, lmer); + self.window + .next(candidate_lmer) + .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) + }), + None => { + self.clear_state(); + None + } + }; + if data.is_some() { + self.size += 1; + return Some((self.size, data.unwrap())); + } + } + } + None + } +} + +impl<'a> Base> { + pub fn seq_size_str(&self) -> OptionPair { + self.body.apply(|m_iter| m_iter.seq_size().to_string()) + } + + pub fn fmt_seq_size(&self) -> String { + self.body + .reduce_str("|", |m_iter| m_iter.seq_size().to_string()) + } + + pub fn fmt_size(&self) -> String { + self.body.reduce_str("|", |m_iter| m_iter.size.to_string()) + } + + 
pub fn fold(&mut self, mut f: F) -> Vec + where + F: FnMut(&mut Vec, &mut MinimizerIterator<'a>, usize) -> usize, + T: Clone, + { + let mut init = Vec::new(); + match &mut self.body { + OptionPair::Single(m_iter) => { + f(&mut init, m_iter, 0); + } + OptionPair::Pair(m_iter1, m_iter2) => { + let offset = f(&mut init, m_iter1, 0); + f(&mut init, m_iter2, offset); + } + } + init + } + + pub fn range(&self) -> OptionPair<(usize, usize)> { + match &self.body { + OptionPair::Single(m_iter) => OptionPair::Single((0, m_iter.size)), + OptionPair::Pair(m_iter1, m_iter2) => { + let size1 = m_iter1.size; + OptionPair::Pair((0, size1), (size1, m_iter2.size + size1)) + } + } + } +} + +pub fn scan_sequence<'a>( + sequence: &'a Base>, + meros: &'a Meros, +) -> Base> { + let func = |seq: &'a Vec| { + let cursor = Cursor::new(meros.l_mer, meros.mask); + let window = MinimizerWindow::new(meros.window_size()); + MinimizerIterator::new(seq, cursor, window, meros) + }; + + match &sequence.body { + OptionPair::Pair(seq1, seq2) => Base::new( + sequence.header.clone(), + OptionPair::Pair(func(&seq1), func(&seq2)), + ), + OptionPair::Single(seq1) => { + Base::new(sequence.header.clone(), OptionPair::Single(func(&seq1))) + } + } +} diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs new file mode 100644 index 0000000..ab09045 --- /dev/null +++ b/seqkmer/src/parallel.rs @@ -0,0 +1,100 @@ +use crate::mmscanner::{scan_sequence, MinimizerIterator}; +use crate::reader::Reader; +use crate::seq::{Base, SeqFormat}; +use crate::{detect_file_format, FastaReader, FastqReader, Meros}; +use crossbeam_channel::{bounded, Receiver}; +use scoped_threadpool::Pool; +use std::io::Result; +use std::sync::Arc; + +pub struct ParallelResult

+where + P: Send, +{ + recv: Receiver

, +} + +impl

ParallelResult

+where + P: Send, +{ + #[inline] + pub fn next(&mut self) -> Option

{ + self.recv.recv().ok() + } +} + +pub fn create_reader( + file_pair: &[String], + file_index: usize, + score: i32, +) -> Result> { + // let mut files_iter = file_pair.iter(); + let paths = crate::OptionPair::from_slice(file_pair); + + match detect_file_format(&file_pair[0])? { + SeqFormat::Fastq => Ok(Box::new(FastqReader::from_path(paths, file_index, score)?)), + SeqFormat::Fasta => Ok(Box::new(FastaReader::from_path(&file_pair[0], file_index)?)), + } +} + +pub fn read_parallel( + reader: &mut R, + n_threads: usize, + meros: &Meros, + work: W, + func: F, +) -> Result<()> +where + R: Reader, + O: Send, + Out: Send + Default, + W: Send + Sync + Fn(&mut Vec>) -> Option, + F: FnOnce(&mut ParallelResult>) -> Out + Send, +{ + assert!(n_threads > 2); + let buffer_len = n_threads + 2; + let (sender, receiver) = bounded::>>>(buffer_len); + let (done_send, done_recv) = bounded::>(buffer_len); + let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver + let done_send = Arc::new(done_send); + let mut pool = Pool::new(n_threads as u32); + + let mut parallel_result = ParallelResult { recv: done_recv }; + + pool.scoped(|pool_scope| { + // 生产者线程 + pool_scope.execute(move || { + while let Ok(Some(seqs)) = reader.next() { + sender.send(seqs).expect("Failed to send sequences"); + } + }); + + // 消费者线程 + for _ in 0..n_threads - 2 { + let receiver = Arc::clone(&receiver); + let work = &work; + let done_send = Arc::clone(&done_send); + pool_scope.execute(move || { + while let Ok(mut seqs) = receiver.recv() { + let mut markers: Vec>> = seqs + .iter_mut() + .map(|seq| scan_sequence(seq, &meros)) + .collect(); + let output = work(&mut markers); + done_send.send(output).expect("Failed to send outputs"); + } + }); + } + + // 引用计数减掉一个,这样都子线程结束时, done_send还能完全释放 + drop(done_send); + pool_scope.execute(move || { + let _ = func(&mut parallel_result); + }); + + pool_scope.join_all(); + }); + + Ok(()) +} diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs new file mode 100644 index 
0000000..77309d7 --- /dev/null +++ b/seqkmer/src/reader.rs @@ -0,0 +1,202 @@ +use crate::seq::{Base, SeqFormat}; +use crate::utils::OptionPair; +use flate2::read::GzDecoder; +use std::fs::File; +use std::io::{self, BufRead, BufReader, Read, Result, Seek}; +use std::path::Path; + +pub(crate) fn dyn_reader>(path: P) -> Result> { + let mut file = open_file(path)?; + if is_gzipped(&mut file)? { + let decoder = GzDecoder::new(file); + Ok(Box::new(decoder)) + } else { + Ok(Box::new(file)) + } +} + +pub(crate) fn is_gzipped(file: &mut File) -> Result { + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + file.rewind()?; // 重置文件指针到开头 + Ok(buffer == [0x1F, 0x8B]) +} + +pub fn trim_pair_info(id: &str) -> String { + let sz = id.len(); + if sz <= 2 { + return id.to_string(); + } + if id.ends_with("/1") || id.ends_with("/2") { + return id[0..sz - 2].to_string(); + } + id.to_string() +} + +pub fn open_file>(path: P) -> Result { + File::open(&path).map_err(|e| { + if e.kind() == io::ErrorKind::NotFound { + io::Error::new(e.kind(), format!("File not found: {:?}", path.as_ref())) + } else { + e + } + }) +} + +pub(crate) fn detect_file_format>(path: P) -> io::Result { + // let mut file = open_file(path)?; + let read1: Box = dyn_reader(path)?; + let reader = BufReader::new(read1); + let mut lines = reader.lines(); + + if let Some(first_line) = lines.next() { + let line = first_line?; + + if line.starts_with('>') { + return Ok(SeqFormat::Fasta); + } else if line.starts_with('@') { + let _ = lines.next(); + if let Some(third_line) = lines.next() { + let line: String = third_line?; + if line.starts_with('+') { + return Ok(SeqFormat::Fastq); + } + } + } else { + return Err(io::Error::new( + io::ErrorKind::Other, + "Unrecognized fasta(fastq) file format", + )); + } + } + + Err(io::Error::new( + io::ErrorKind::Other, + "Unrecognized fasta(fastq) file format", + )) +} + +pub(crate) fn trim_end(buffer: &mut Vec) { + while let Some(&b'\n' | &b'\r' | &b'>' | &b'@') = buffer.last() { + 
buffer.pop(); + } +} + +pub const BUFSIZE: usize = 16 * 1024 * 1024; + +pub trait Reader: Send { + fn next(&mut self) -> Result>>>>; +} + +impl Reader for Box { + fn next(&mut self) -> Result>>>> { + (**self).next() + } +} + +#[derive(Debug)] +pub struct PosData { + /// 外部 taxonomy id + pub ext_code: u64, + /// 连续命中次数 + pub count: usize, +} + +impl PosData { + pub fn new(ext_code: u64, count: usize) -> Self { + Self { ext_code, count } + } +} + +impl fmt::Display for PosData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{}", self.ext_code, self.count) + } +} + +use std::fmt; + +#[derive(Debug)] +pub struct SpaceDist { + pub value: Vec, + /// example: (0, 10], 左开右闭 + pub range: (usize, usize), + pos: usize, +} + +impl SpaceDist { + pub fn new(range: (usize, usize)) -> Self { + Self { + value: Vec::new(), + range, + pos: range.0, + } + } + + fn fill_with_zeros(&mut self, gap: usize) { + if gap > 0 { + self.value.push(PosData::new(0, gap)); + } + } + + pub fn add(&mut self, ext_code: u64, pos: usize) { + if pos <= self.pos || pos > self.range.1 { + return; // 早期返回,不做任何处理 + } + let gap = pos - self.pos - 1; + + if gap > 0 { + self.fill_with_zeros(gap); + } + + if let Some(last) = self.value.last_mut() { + if last.ext_code == ext_code { + last.count += 1; + } else { + self.value.push(PosData::new(ext_code, 1)); + } + } else { + self.value.push(PosData::new(ext_code, 1)); + } + self.pos = pos; + } + + /// Fills the end of the distribution with zeros if there is remaining space. 
+ pub fn fill_tail_with_zeros(&mut self) { + if self.pos < self.range.1 { + self.fill_with_zeros(self.range.1 - self.pos); + self.pos = self.range.1; + } + } +} + +impl fmt::Display for SpaceDist { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for (i, data) in self.value.iter().enumerate() { + if i > 0 { + write!(f, " ")?; + } + write!(f, "{}", data)?; + } + write!(f, "") + } +} + +impl OptionPair { + pub fn add(&mut self, ext_code: u64, pos: usize) { + match self { + OptionPair::Single(sd) => sd.add(ext_code, pos), + OptionPair::Pair(sd1, sd2) => { + if pos > sd1.range.1 { + sd2.add(ext_code, pos) + } else { + sd1.add(ext_code, pos) + } + } + } + } + + pub fn fill_tail_with_zeros(&mut self) { + self.apply_mut(|sd| sd.fill_tail_with_zeros()); + } +} diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs new file mode 100644 index 0000000..aacfbaf --- /dev/null +++ b/seqkmer/src/seq.rs @@ -0,0 +1,37 @@ +use crate::utils::OptionPair; + +#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub enum SeqFormat { + Fasta, + Fastq, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SeqHeader { + pub id: String, + pub file_index: usize, + pub reads_index: usize, + pub format: SeqFormat, +} + +#[derive(Debug)] +pub struct Base { + pub header: SeqHeader, + pub body: OptionPair, +} + +impl Base { + pub fn new(header: SeqHeader, body: OptionPair) -> Self { + Self { header, body } + } + + pub fn map(&self, mut f: F) -> Result, E> + where + F: FnMut(&T) -> Result, + { + self.body.map(|t| f(&t)).map(|body| Base { + header: self.header.clone(), + body, + }) + } +} diff --git a/seqkmer/src/utils.rs b/seqkmer/src/utils.rs new file mode 100644 index 0000000..f5e77bc --- /dev/null +++ b/seqkmer/src/utils.rs @@ -0,0 +1,106 @@ +#[derive(Debug, Clone)] +pub enum OptionPair { + Single(T), + Pair(T, T), +} + +impl OptionPair { + pub fn single(&self) -> Option<&T> { + match self { + OptionPair::Single(value) => Some(value), + _ => None, + } + } + + // 它接受一个泛型闭包 F,并返回一个新的 
OptionPair + pub fn map(&self, mut f: F) -> Result, E> + where + F: FnMut(&T) -> Result, + { + match self { + OptionPair::Single(t) => f(t).map(OptionPair::Single), + OptionPair::Pair(t1, t2) => { + let u1 = f(t1)?; + let u2 = f(t2)?; + Ok(OptionPair::Pair(u1, u2)) + } + } + } + + // pub fn concat(&self, init: &mut U, mut f: F) -> V + // where + // F: FnMut(&mut U, &T) -> V, + // { + // match self { + // OptionPair::Single(t) => f(init, t), + // OptionPair::Pair(t1, t2) => { + // f(init, t1); + // f(init, t2) + // } + // } + // } + pub fn reduce(&self, init: U, mut f: F) -> U + where + F: FnMut(U, &T) -> U, + { + match self { + OptionPair::Single(t) => f(init, t), + OptionPair::Pair(t1, t2) => { + let result = f(init, t1); + f(result, t2) + } + } + } + + pub fn reduce_str(&self, sep: &str, mut f: F) -> String + where + F: FnMut(&T) -> String, + { + self.reduce(String::new(), |acc, t| { + if acc.is_empty() { + f(t) + } else { + format!("{}{}{}", acc, sep, f(t)) + } + }) + } + + pub fn apply(&self, mut f: F) -> OptionPair + where + F: FnMut(&T) -> U, + { + match self { + OptionPair::Single(t) => OptionPair::Single(f(t)), + OptionPair::Pair(t1, t2) => OptionPair::Pair(f(t1), f(t2)), + } + } + + pub fn apply_mut(&mut self, mut f: F) -> OptionPair + where + F: FnMut(&mut T) -> U, + { + match self { + OptionPair::Single(t) => OptionPair::Single(f(t)), + OptionPair::Pair(t1, t2) => OptionPair::Pair(f(t1), f(t2)), + } + } +} + +impl OptionPair { + pub fn from_slice(slice: &[T]) -> OptionPair { + match slice { + [a, b] => OptionPair::Pair(a.clone(), b.clone()), + [a] => OptionPair::Single(a.clone()), + _ => unreachable!(), + } + } +} + +impl From<(T, Option)> for OptionPair { + fn from(tuple: (T, Option)) -> Self { + match tuple { + (a, Some(b)) => OptionPair::Pair(a, b), + (a, None) => OptionPair::Single(a), + } + } +}