From 98807bae34eb08dd8c8e656593400b94da8802ad Mon Sep 17 00:00:00 2001 From: dagou Date: Fri, 14 Jun 2024 21:36:01 +0800 Subject: [PATCH 01/18] bug fix --- Cargo.toml | 4 +- kr2r/Cargo.toml | 1 + kr2r/src/bin/classify.rs | 198 ++++++++++++++++++++++++++++++-------- kr2r/src/compact_hash.rs | 2 +- seqkmer/Cargo.toml | 18 ++++ seqkmer/src/fastq.rs | 116 ++++++++++++++++++++++ seqkmer/src/feat.rs | 202 +++++++++++++++++++++++++++++++++++++++ seqkmer/src/lib.rs | 8 ++ seqkmer/src/mmscanner.rs | 182 +++++++++++++++++++++++++++++++++++ seqkmer/src/parallel.rs | 83 ++++++++++++++++ seqkmer/src/reader.rs | 44 +++++++++ seqkmer/src/seq.rs | 14 +++ 12 files changed, 831 insertions(+), 41 deletions(-) create mode 100644 seqkmer/Cargo.toml create mode 100644 seqkmer/src/fastq.rs create mode 100644 seqkmer/src/feat.rs create mode 100644 seqkmer/src/lib.rs create mode 100644 seqkmer/src/mmscanner.rs create mode 100644 seqkmer/src/parallel.rs create mode 100644 seqkmer/src/reader.rs create mode 100644 seqkmer/src/seq.rs diff --git a/Cargo.toml b/Cargo.toml index 2258ff0..1f8cc06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [workspace] members = [ "ncbi", - "kr2r" -] + "kr2r", + "seqkmer"] resolver = "2" diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml index 3d958ac..791d524 100644 --- a/kr2r/Cargo.toml +++ b/kr2r/Cargo.toml @@ -18,6 +18,7 @@ double_hashing = [] exact_counting = [] [dependencies] +seqkmer = { version = "0.1.0", path = "../seqkmer" } clap = { version = "4.4.10", features = ["derive"] } seq_io = "0.3.2" hyperloglogplus = { version = "*", features = ["const-loop"] } diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index af00c35..7c8e890 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -21,6 +21,11 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; +use seqkmer::fastq::FastqReader; +use seqkmer::parallel::read_parallel as s_parallel; +use seqkmer::reader::Reader; +use 
seqkmer::Meros as SMeros; + #[derive(Parser, Debug, Clone)] #[clap( version, @@ -182,6 +187,96 @@ fn process_record( output_line } +fn process_seq1( + miner: Vec, + hash_config: &HashConfig, + chtable: &CHTable, + offset: u32, +) -> (u32, Vec) { + let chunk_size = hash_config.hash_capacity; + let value_bits = hash_config.value_bits; + + let mut rows = Vec::new(); + let mut kmer_count = 0; + for (sort, hash_key) in miner.into_iter().enumerate() { + let idx = hash_config.index(hash_key); + let partition_index = idx / chunk_size; + let index = idx % chunk_size; + let taxid = chtable.get_from_page(index, hash_key, partition_index + 1); + if taxid > 0 { + let compacted_key = hash_key.left(value_bits) as u32; + let high = u32::combined(compacted_key, taxid, value_bits); + let row = Row::new(high, 0, sort as u32 + 1 + offset); + rows.push(row); + } + kmer_count += 1; + } + (kmer_count, rows) +} + +fn process_record1( + dna_id: String, + seq1: Vec, + seq2: Option>, + args: &Args, + taxonomy: &Taxonomy, + meros: Meros, + chtable: &CHTable, + hash_config: &HashConfig, + cur_taxon_counts: &TaxonCountersDash, + classify_counter: &AtomicUsize, +) -> String { + let value_mask = hash_config.value_mask; + let mut seq_len_str = String::new(); + let seq1_len = seq1.len(); + seq_len_str.push_str(&seq1_len.to_string()); + + let (kmer_count1, mut rows) = process_seq1(seq1, &hash_config, chtable, 0); + let kmer_count2 = if let Some(seq) = seq2 { + let scan2 = MinimizerScanner::new(&seq, meros); + let (kmer_count2, rows2) = process_seq(scan2, &hash_config, chtable, kmer_count1); + rows.extend_from_slice(&rows2); + seq_len_str.push_str(format!("|{}", seq.len()).as_str()); + Some(kmer_count2) + } else { + None + }; + let total_kmers: usize = (kmer_count1 + kmer_count2.unwrap_or(0)) as usize; + let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, kmer_count1); + let hit_string = add_hitlist_string(&rows, value_mask, kmer_count1, kmer_count2, taxonomy); + let mut call = 
resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold); + if call > 0 && hit_groups < args.minimum_hit_groups { + call = 0; + }; + + cur_counts.iter().for_each(|entry| { + cur_taxon_counts + .entry(*entry.key()) + .or_default() + .merge(entry.value()) + .unwrap(); + }); + + let ext_call = taxonomy.nodes[call as usize].external_id; + let clasify = if call > 0 { + classify_counter.fetch_add(1, Ordering::SeqCst); + cur_taxon_counts + .entry(call as u64) + .or_default() + .increment_read_count(); + + "C" + } else { + "U" + }; + // 使用锁来同步写入 + let output_line = format!( + "{}\t{}\t{}\t{}\t{}\n", + clasify, dna_id, ext_call, seq_len_str, hit_string + ); + output_line +} + fn process_fasta_file( args: &Args, meros: Meros, @@ -314,45 +409,72 @@ fn process_fastq_file( let sequence_count = AtomicUsize::new(0); let classify_counter = AtomicUsize::new(0); - let reader = seq::PairFastqReader::from_path(&file1, file2.as_ref()) - .expect("Unable to create pair reader from paths"); - read_parallel( - reader, - args.num_threads as u32, - args.num_threads as usize, - |record_set| { - let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = trim_pair_info(records.0.id().unwrap_or_default()); - sequence_count.fetch_add(1, Ordering::SeqCst); - let seq1: Vec = records.0.seq_x(score); - let seq2 = records.1.map(|seq| seq.seq_x(score)); - let output_line = process_record( - dna_id, - seq1, - seq2, - args, - taxonomy, - meros, - chtable, - &hash_config, - &cur_taxon_counts, - &classify_counter, - ); - - buffer.push_str(&output_line); - } - buffer - }, - |record_sets| { - while let Some(Ok((_, buffer))) = record_sets.next() { - writer - .write_all(buffer.as_bytes()) - .expect("write data error"); - } + let mut reader1 = FastqReader::from_path(&file1, 1, 0)?; + let _ = s_parallel( + &mut reader1, + 13, + 15, + None, + SMeros::default(), + |seq1, seq| { + let dna_id = trim_pair_info(&seq.id); + sequence_count.fetch_add(1, Ordering::SeqCst); + 
+ let seq2 = None; + let output_line = process_record1( + dna_id, + seq1, + seq2, + args, + taxonomy, + meros, + chtable, + &hash_config, + &cur_taxon_counts, + &classify_counter, + ); + None }, ); + // let reader = seq::PairFastqReader::from_path(&file1, file2.as_ref()) + // .expect("Unable to create pair reader from paths"); + // read_parallel( + // reader, + // args.num_threads as u32, + // args.num_threads as usize, + // |record_set| { + // let mut buffer = String::new(); + + // for records in record_set.into_iter() { + // let dna_id = trim_pair_info(records.0.id().unwrap_or_default()); + // sequence_count.fetch_add(1, Ordering::SeqCst); + // let seq1: Vec = records.0.seq_x(score); + // let seq2 = records.1.map(|seq| seq.seq_x(score)); + // let output_line = process_record( + // dna_id, + // seq1, + // seq2, + // args, + // taxonomy, + // meros, + // chtable, + // &hash_config, + // &cur_taxon_counts, + // &classify_counter, + // ); + + // buffer.push_str(&output_line); + // } + // buffer + // }, + // |record_sets| { + // while let Some(Ok((_, buffer))) = record_sets.next() { + // writer + // .write_all(buffer.as_bytes()) + // .expect("write data error"); + // } + // }, + // ); let mut sample_taxon_counts: HashMap< u64, @@ -496,7 +618,7 @@ pub fn run(args: Args) -> Result<()> { let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; - println!("hash_config {:?}", hash_config); + println!("{:?}", hash_config); if hash_config.hash_capacity == 0 { panic!("`hash_capacity` can't be zero!"); } diff --git a/kr2r/src/compact_hash.rs b/kr2r/src/compact_hash.rs index 779a130..d7538e0 100644 --- a/kr2r/src/compact_hash.rs +++ b/kr2r/src/compact_hash.rs @@ -186,7 +186,7 @@ pub struct HashConfig { // 为HashConfig手动实现Debug trait impl fmt::Debug for HashConfig { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("CompactHashTableConfig") + f.debug_struct("HashConfig") .field("value_mask", &self.value_mask) 
.field("value_bits", &self.value_bits) .field("capacity", &self.capacity) diff --git a/seqkmer/Cargo.toml b/seqkmer/Cargo.toml new file mode 100644 index 0000000..03a50de --- /dev/null +++ b/seqkmer/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "seqkmer" +version = "0.1.0" +edition = "2021" +authors = ["eric9n@gmail.com"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +crossbeam-channel = "0.5" +scoped_threadpool = "0.1.9" +flate2 = "1.0" + + +[features] +default = ["dna"] +dna = [] +protein = [] diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs new file mode 100644 index 0000000..698114c --- /dev/null +++ b/seqkmer/src/fastq.rs @@ -0,0 +1,116 @@ +use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; +use crate::seq::{SeqFormat, Sequence}; +use std::io::{BufRead, BufReader, Read, Result}; +use std::path::Path; + +/// FastqReader +pub struct FastqReader { + pub reader: BufReader, + pub file_index: u64, + pub reads_index: u64, + pub seq_id: String, + + score: i32, + header: Vec, + seq: Vec, + plus: Vec, + quals: Vec, +} + +impl FastqReader +where + R: Read + Send, +{ + pub fn new(reader: R, file_index: u64, score: i32) -> Self { + Self::with_capacity(reader, file_index, BUFSIZE, score) + } + + pub fn with_capacity<'a>(reader: R, file_index: u64, capacity: usize, score: i32) -> Self { + assert!(capacity >= 3); + Self { + reader: BufReader::with_capacity(capacity, reader), + file_index, + reads_index: 0, + seq_id: String::new(), + header: Vec::new(), + seq: Vec::new(), + plus: Vec::new(), + quals: Vec::new(), + score, + } + } + + pub fn read_lines(&mut self) -> Result> { + // 读取fastq文件header部分 + self.header.clear(); + if self.reader.read_until(b'\n', &mut self.header)? == 0 { + return Ok(None); + } + // 读取fastq文件seq部分 + self.seq.clear(); + if self.reader.read_until(b'\n', &mut self.seq)? 
== 0 { + return Ok(None); + } + trim_end(&mut self.seq); + + // 读取fastq文件+部分 + self.plus.clear(); + if self.reader.read_until(b'\n', &mut self.plus)? == 0 { + return Ok(None); + } + + // 读取fastq文件quals部分 + self.quals.clear(); + if self.reader.read_until(b'\n', &mut self.quals)? == 0 { + return Ok(None); + } + trim_end(&mut self.quals); + + let seq_id = unsafe { + let s = std::str::from_utf8_unchecked(&self.header[1..]); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + self.reads_index += 1; + + if self.score > 0 { + for (base, &qscore) in self.seq.iter_mut().zip(self.quals.iter()) { + if (qscore as i32 - '!' as i32) < self.score { + *base = b'x'; + } + } + } + + let sequence = Sequence { + file_index: self.file_index, + reads_index: self.reads_index, + id: seq_id.to_owned(), + seq: self.seq.to_owned(), + format: SeqFormat::FASTQ, + }; + Ok(Some(sequence)) + } +} + +impl FastqReader> { + #[inline] + pub fn from_path>(path: P, file_index: u64, score: i32) -> Result { + let reader = dyn_reader(path)?; + Ok(Self::new(reader, file_index, score)) + } +} + +impl Reader for FastqReader +where + R: Read + Send, +{ + fn next(&mut self) -> Result> { + self.read_lines() + } +} diff --git a/seqkmer/src/feat.rs b/seqkmer/src/feat.rs new file mode 100644 index 0000000..8b06118 --- /dev/null +++ b/seqkmer/src/feat.rs @@ -0,0 +1,202 @@ +#[cfg(feature = "dna")] +pub mod constants { + pub const DEFAULT_KMER_LENGTH: u64 = 35; + pub const DEFAULT_MINIMIZER_LENGTH: u8 = 31; + pub const DEFAULT_MINIMIZER_SPACES: u8 = 7; + + pub const BITS_PER_CHAR: usize = 2; +} + +#[cfg(feature = "protein")] +pub mod constants { + pub const DEFAULT_KMER_LENGTH: u64 = 15; + pub const DEFAULT_MINIMIZER_LENGTH: u8 = 12; + pub const DEFAULT_MINIMIZER_SPACES: u8 = 0; + + pub const BITS_PER_CHAR: usize = 4; +} + +#[cfg(feature = "dna")] +#[inline] +pub fn char_to_value(c: u8) -> Option { + match c { 
+ b'A' | b'a' => Some(0x00), + b'C' | b'c' => Some(0x01), + b'G' | b'g' => Some(0x02), + b'T' | b't' => Some(0x03), + _ => None, + } +} + +#[cfg(feature = "protein")] +#[inline] +pub fn char_to_value(c: u8) -> Option<64> { + match c { + // stop codons/rare amino acids + b'*' | b'U' | b'u' | b'O' | b'o' => Some(0x00), + // alanine + b'A' | b'a' => Some(0x01), + // asparagine, glutamine, serine + b'N' | b'n' | b'Q' | b'q' | b'S' | b's' => Some(0x02), + // cysteine + b'C' | b'c' => Some(0x03), + // aspartic acid, glutamic acid + b'D' | b'd' | b'E' | b'e' => Some(0x04), + // phenylalanine + b'F' | b'f' => Some(0x05), + // glycine + b'G' | b'g' => Some(0x06), + // histidine + b'H' | b'h' => Some(0x07), + // isoleucine, leucine + b'I' | b'i' | b'L' | b'l' => Some(0x08), + // lysine + b'K' | b'k' => Some(0x09), + // proline + b'P' | b'p' => Some(0x0a), + // arginine + b'R' | b'r' => Some(0x0b), + // methionine, valine + b'M' | b'm' | b'V' | b'v' => Some(0x0c), + // threonine + b'T' | b't' => Some(0x0d), + // tryptophan + b'W' | b'w' => Some(0x0e), + // tyrosine + b'Y' | b'y' => Some(0x0f), + _ => None, + } +} + +#[inline] +fn reverse_complement(mut kmer: u64, n: usize) -> u64 { + // Reverse bits while leaving bit pairs (nucleotides) intact. 
+ + // Swap consecutive pairs of bits + kmer = (kmer >> 2 & 0x3333333333333333) | (kmer << 2 & 0xCCCCCCCCCCCCCCCC); + + // Swap consecutive nibbles (4-bit groups) + kmer = (kmer >> 4 & 0x0F0F0F0F0F0F0F0F) | (kmer << 4 & 0xF0F0F0F0F0F0F0F0); + + // Swap consecutive bytes + kmer = (kmer >> 8 & 0x00FF00FF00FF00FF) | (kmer << 8 & 0xFF00FF00FF00FF00); + + // Swap consecutive pairs of bytes + kmer = (kmer >> 16 & 0x0000FFFF0000FFFF) | (kmer << 16 & 0xFFFF0000FFFF0000); + + // Swap the two halves of the 64-bit word + kmer = (kmer >> 32) | (kmer << 32); + + // Complement the bits, shift to the right length, and mask to get the desired length + (!kmer >> (64 - n * 2)) & ((1u64 << (n * 2)) - 1) + + // if revcom_version == 0 { + // // Complement the bits and mask to get the desired length + // !kmer & ((1u64 << (n * 2)) - 1) + // } else { + // // Complement the bits, shift to the right length, and mask to get the desired length + // (!kmer >> (64 - n * 2)) & ((1u64 << (n * 2)) - 1) + // } +} + +#[cfg(feature = "dna")] +#[inline] +pub fn canonical_representation(kmer: u64, n: usize) -> u64 { + let revcom = reverse_complement(kmer, n); + if kmer < revcom { + kmer + } else { + revcom + } +} + +#[cfg(feature = "protein")] +#[inline] +pub fn canonical_representation(kmer: u64, n: usize, revcom_version: u8) -> u64 { + kmer +} + +pub const DEFAULT_TOGGLE_MASK: u64 = 0xe37e28c4271b5a2d; +pub const DEFAULT_SPACED_SEED_MASK: u64 = 0; +pub const CURRENT_REVCOM_VERSION: u8 = 1; + +// 声明常量 +const M1: u64 = 0xff51afd7ed558ccd; +const M2: u64 = 0xc4ceb9fe1a85ec53; + +/// +/// # Examples +/// +/// ``` +/// # use kr2r::fmix64; +/// let key: u64 = 123; +/// let hash = fmix64(key); +/// assert_eq!(hash, 9208534749291869864); +/// ``` +#[inline] +pub fn fmix64(key: u64) -> u64 { + let mut k = key; + k ^= k >> 33; + k = k.wrapping_mul(M1); + k ^= k >> 33; + k = k.wrapping_mul(M2); + k ^= k >> 33; + k +} + +/// minimizer config +#[derive(Copy, Debug, Clone)] +pub struct Meros { + pub k_mer: usize, 
+ pub l_mer: usize, + pub mask: u64, + pub spaced_seed_mask: u64, + pub toggle_mask: u64, + pub min_clear_hash_value: Option, +} + +impl Meros { + pub fn new( + k_mer: usize, + l_mer: usize, + spaced_seed_mask: Option, + toggle_mask: Option, + min_clear_hash_value: Option, + ) -> Self { + let mut mask = 1u64; + mask <<= l_mer * constants::BITS_PER_CHAR; + mask -= 1; + + Self { + k_mer, + l_mer, + mask, + spaced_seed_mask: spaced_seed_mask.unwrap_or(DEFAULT_SPACED_SEED_MASK), + toggle_mask: toggle_mask.unwrap_or(DEFAULT_TOGGLE_MASK) & mask, + min_clear_hash_value, + } + } + + pub fn window_size(&self) -> usize { + self.k_mer - self.l_mer + } +} + +impl Default for Meros { + fn default() -> Self { + let l_mer = constants::DEFAULT_MINIMIZER_LENGTH as usize; + let k_mer = constants::DEFAULT_KMER_LENGTH as usize; + let mut mask = 1u64; + mask <<= l_mer * constants::BITS_PER_CHAR; + mask -= 1; + + Self { + k_mer, + l_mer, + mask, + spaced_seed_mask: 4611686018212639539, // DEFAULT_SPACED_SEED_MASK + toggle_mask: DEFAULT_TOGGLE_MASK & mask, + min_clear_hash_value: None, + } + } +} diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs new file mode 100644 index 0000000..758e421 --- /dev/null +++ b/seqkmer/src/lib.rs @@ -0,0 +1,8 @@ +pub mod fastq; +mod feat; +pub mod mmscanner; +pub mod reader; +pub mod seq; +pub use feat::constants::*; +pub use feat::*; +pub mod parallel; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs new file mode 100644 index 0000000..b0d1fed --- /dev/null +++ b/seqkmer/src/mmscanner.rs @@ -0,0 +1,182 @@ +// kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash +use crate::{ + canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, +}; +use std::collections::VecDeque; + +#[inline] +fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { + let mut canonical_lmer = canonical_representation(lmer, meros.l_mer); + if meros.spaced_seed_mask > 0 { + canonical_lmer &= meros.spaced_seed_mask; + } + canonical_lmer ^ 
meros.toggle_mask +} + +#[derive(Debug)] +struct MinimizerData { + pub pos: usize, + candidate_lmer: u64, +} + +impl MinimizerData { + fn new(candidate_lmer: u64, pos: usize) -> Self { + Self { + candidate_lmer, + pos, + } + } +} + +pub struct MinimizerWindow { + queue: VecDeque, + queue_pos: usize, + /// 窗口队列的大小 + capacity: usize, + /// 队列计数 + count: usize, +} + +impl MinimizerWindow { + fn new(capacity: usize) -> Self { + Self { + queue: VecDeque::with_capacity(capacity), + capacity, + count: 0, + queue_pos: 0, + } + } + + #[inline] + fn next(&mut self, candidate_lmer: u64) -> Option { + // 无需比较,直接返回 + if self.capacity == 1 { + return Some(candidate_lmer); + } + + let data = MinimizerData::new(candidate_lmer, self.count); + + // 移除队列中所有比当前元素大的元素的索引 + // 因为它们不可能是当前窗口的最小值 + while let Some(m_data) = self.queue.back() { + if m_data.candidate_lmer > candidate_lmer { + self.queue.pop_back(); + } else { + break; + } + } + let mut changed = false; + + if (self.queue.is_empty() && self.count >= self.capacity) || self.count == self.capacity { + changed = true + } + // 将当前元素的索引添加到队列 + self.queue.push_back(data); + + while !self.queue.is_empty() + && self.queue.front().map_or(false, |front| { + self.count >= self.capacity && front.pos < self.count - self.capacity + }) + { + self.queue.pop_front(); + changed = true; + } + + self.count += 1; + if changed { + self.queue.front().map(|front| front.candidate_lmer) + } else { + None + } + } + + fn clear(&mut self) { + self.count = 0; + self.queue_pos = 0; + self.queue.clear(); + } +} + +struct Cursor { + pos: usize, + capacity: usize, + value: u64, + mask: u64, +} + +impl Cursor { + fn new(capacity: usize, mask: u64) -> Self { + Self { + pos: 0, + value: 0, + capacity, + mask, + } + } + + fn next_lmer(&mut self, item: u64) -> Option { + self.value = ((self.value << BITS_PER_CHAR) | item) & self.mask; + // 更新当前位置 + self.pos += 1; + // 检查是否达到了容量 + if self.pos >= self.capacity { + return Some(self.value); + } + None + } + + // 清除元素 + 
#[inline] + fn clear(&mut self) { + self.pos = 0; + self.value = 0; + } +} + +pub struct MinimizerScanner<'a> { + seq: &'a [u8], + meros: Meros, + cursor: Cursor, + window: MinimizerWindow, +} + +impl<'a> MinimizerScanner<'a> { + pub fn new(seq: &'a [u8], meros: Meros) -> Self { + MinimizerScanner { + seq, + meros, + cursor: Cursor::new(meros.l_mer, meros.mask), + window: MinimizerWindow::new(meros.window_size()), + } + } + + #[inline] + fn clear(&mut self) { + self.cursor.clear(); + self.window.clear(); + } + + pub fn iter(&mut self) -> Vec { + self.seq + .iter() + .filter_map(|&ch| { + // if ch == b'\n' || ch == b'\r' { + // None + // } else { + match char_to_value(ch) { + Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { + let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); + self.window + .next(candidate_lmer) + .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) + }), + None => { + self.clear(); + None + } + } + // } + }) + .collect() + } +} diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs new file mode 100644 index 0000000..505b3d1 --- /dev/null +++ b/seqkmer/src/parallel.rs @@ -0,0 +1,83 @@ +use crate::mmscanner::MinimizerScanner; +use crate::reader::Reader; +use crate::seq::Sequence; +use crate::Meros; +use crossbeam_channel::bounded; +use scoped_threadpool::Pool; +use std::fs::File; +use std::io::Read; +use std::io::{self, BufWriter, Result, Write}; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +pub fn read_parallel( + reader: &mut dyn Reader, + n_threads: usize, + buffer_len: usize, + output_file: Option<&PathBuf>, + meros: Meros, + work: W, +) -> Result<()> +where + R: Read + Send, + W: Send + Sync + Fn(Vec, Sequence) -> Option, +{ + assert!(n_threads <= buffer_len); + let (sender, receiver) = bounded::(buffer_len); + let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver + let mut pool = Pool::new(10); + + let counter = 
Arc::new(AtomicUsize::new(0)); + + let mut writer: Box = match output_file { + Some(file_name) => { + let file = File::create(file_name)?; + Box::new(BufWriter::new(file)) as Box + } + None => Box::new(io::stdout()) as Box, + }; + + let _ = pool.scoped(|pool_scope| -> Result<()> { + // 生产者线程 + pool_scope.execute(move || { + while let Some(seq) = reader.next().unwrap() { + sender.send(seq).unwrap(); + } + }); + + // 消费者线程 + for i in 0..n_threads { + let receiver = Arc::clone(&receiver); + let counter_clone = Arc::clone(&counter); + let work = &work; + + let mut temp_writer: Box = match output_file { + Some(file_name) => { + let parent_dir = file_name.parent().unwrap_or_else(|| Path::new("")); + let file_name = file_name.file_name().unwrap().to_str().unwrap(); + let filename = parent_dir.join(format!("{}.tmp.{}", file_name, i)); + let file = File::create(filename)?; + Box::new(BufWriter::new(file)) as Box + } + None => Box::new(io::stdout()) as Box, + }; + pool_scope.execute(move || { + while let Ok(seq) = receiver.recv() { + counter_clone.fetch_add(1, Ordering::Relaxed); + let mut ms = MinimizerScanner::new(&seq.seq, meros); + let res = ms.iter(); + if let Some(out) = work(res, seq) { + temp_writer + .write_all(out.as_bytes()) + .expect("write data error"); + } + } + }); + } + pool_scope.join_all(); + Ok(()) + }); + println!("counter {:?}", counter.load(Ordering::Relaxed)); + Ok(()) +} diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs new file mode 100644 index 0000000..c7bc395 --- /dev/null +++ b/seqkmer/src/reader.rs @@ -0,0 +1,44 @@ +use crate::seq::Sequence; +use flate2::read::GzDecoder; +use std::fs::File; +use std::io::{self, Read, Result, Seek}; +use std::path::Path; + +pub fn dyn_reader>(path: P) -> Result> { + let mut file = open_file(path)?; + if is_gzipped(&mut file)? 
{ + let decoder = GzDecoder::new(file); + Ok(Box::new(decoder)) + } else { + Ok(Box::new(file)) + } +} + +pub fn is_gzipped(file: &mut File) -> Result { + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + file.rewind()?; // 重置文件指针到开头 + Ok(buffer == [0x1F, 0x8B]) +} + +pub fn open_file>(path: P) -> Result { + File::open(&path).map_err(|e| { + if e.kind() == io::ErrorKind::NotFound { + io::Error::new(e.kind(), format!("File not found: {:?}", path.as_ref())) + } else { + e + } + }) +} + +pub fn trim_end(buffer: &mut Vec) { + while let Some(&b'\n' | &b'\r') = buffer.last() { + buffer.pop(); + } +} + +pub const BUFSIZE: usize = 8 * 1024 * 1024; + +pub trait Reader: Send { + fn next(&mut self) -> Result>; +} diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs new file mode 100644 index 0000000..bfd4de8 --- /dev/null +++ b/seqkmer/src/seq.rs @@ -0,0 +1,14 @@ +#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub enum SeqFormat { + FASTA, + FASTQ, +} + +#[derive(Debug, Clone)] +pub struct Sequence { + pub file_index: u64, + pub reads_index: u64, + pub id: String, + pub seq: Vec, + pub format: SeqFormat, +} From d0739068b9142f2984ce398e8f4cfd34d9091ef5 Mon Sep 17 00:00:00 2001 From: dagou Date: Sat, 15 Jun 2024 21:09:09 +0800 Subject: [PATCH 02/18] seq kmer --- kr2r/src/bin/classify.rs | 386 ++++++++------------------------------- kr2r/src/kr2r_data.rs | 12 ++ seqkmer/Cargo.toml | 1 - seqkmer/src/fasta.rs | 106 +++++++++++ seqkmer/src/fastq.rs | 205 +++++++++++++++++---- seqkmer/src/lib.rs | 5 +- seqkmer/src/mmscanner.rs | 47 +++-- seqkmer/src/parallel.rs | 105 +++++------ seqkmer/src/reader.rs | 43 ++++- seqkmer/src/seq.rs | 52 +++++- 10 files changed, 535 insertions(+), 427 deletions(-) create mode 100644 seqkmer/src/fasta.rs diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index 7c8e890..ab448c6 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -1,18 +1,14 @@ use clap::Parser; use kr2r::classify::{add_hitlist_string, 
count_values, resolve_tree, trim_pair_info}; use kr2r::compact_hash::{CHTable, Compact, HashConfig, Row}; -use kr2r::mmscanner::MinimizerScanner; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; -use kr2r::seq::{self, open_fasta_reader, SeqX}; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{ create_sample_file, detect_file_format, find_and_sort_files, get_lastest_file_index, FileFormat, }; -use kr2r::{IndexOptions, Meros}; -use seq_io::fasta::Record; -use seq_io::fastq::Record as FqRecord; -use seq_io::parallel::read_parallel; +use kr2r::IndexOptions; +use seqkmer::seq::BaseType; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -21,10 +17,10 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; -use seqkmer::fastq::FastqReader; -use seqkmer::parallel::read_parallel as s_parallel; -use seqkmer::reader::Reader; -use seqkmer::Meros as SMeros; +use seqkmer::parallel::read_parallel; +use seqkmer::reader::SeqMer; +use seqkmer::Meros; +use seqkmer::{reader::Reader, FastaReader, FastqPairReader, FastqReader}; #[derive(Parser, Debug, Clone)] #[clap( @@ -97,17 +93,16 @@ pub struct Args { } fn process_seq( - miner: MinimizerScanner, + minimizer: &Vec, hash_config: &HashConfig, chtable: &CHTable, offset: u32, -) -> (u32, Vec) { +) -> Vec { let chunk_size = hash_config.hash_capacity; let value_bits = hash_config.value_bits; let mut rows = Vec::new(); - let mut kmer_count = 0; - for (sort, hash_key) in miner.into_iter().enumerate() { + for (sort, &hash_key) in minimizer.iter().enumerate() { let idx = hash_config.index(hash_key); let partition_index = idx / chunk_size; let index = idx % chunk_size; @@ -118,132 +113,46 @@ fn process_seq( let row = Row::new(high, 0, sort as u32 + 1 + offset); rows.push(row); } - kmer_count += 1; } - (kmer_count, rows) + rows } fn process_record( dna_id: String, - seq1: Vec, - seq2: Option>, + seq: &SeqMer, args: 
&Args, taxonomy: &Taxonomy, - meros: Meros, chtable: &CHTable, hash_config: &HashConfig, cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, ) -> String { let value_mask = hash_config.value_mask; - let mut seq_len_str = String::new(); - let seq1_len = seq1.len(); - seq_len_str.push_str(&seq1_len.to_string()); - - let scan1 = MinimizerScanner::new(&seq1, meros); - let (kmer_count1, mut rows) = process_seq(scan1, &hash_config, chtable, 0); - let kmer_count2 = if let Some(seq) = seq2 { - let scan2 = MinimizerScanner::new(&seq, meros); - let (kmer_count2, rows2) = process_seq(scan2, &hash_config, chtable, kmer_count1); - rows.extend_from_slice(&rows2); - seq_len_str.push_str(format!("|{}", seq.len()).as_str()); - Some(kmer_count2) - } else { - None - }; - let total_kmers: usize = (kmer_count1 + kmer_count2.unwrap_or(0)) as usize; - let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, kmer_count1); - let hit_string = add_hitlist_string(&rows, value_mask, kmer_count1, kmer_count2, taxonomy); - let mut call = resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold); - if call > 0 && hit_groups < args.minimum_hit_groups { - call = 0; - }; - cur_counts.iter().for_each(|entry| { - cur_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(entry.value()) - .unwrap(); - }); - - let ext_call = taxonomy.nodes[call as usize].external_id; - let clasify = if call > 0 { - classify_counter.fetch_add(1, Ordering::SeqCst); - cur_taxon_counts - .entry(call as u64) - .or_default() - .increment_read_count(); - - "C" - } else { - "U" - }; - // 使用锁来同步写入 - let output_line = format!( - "{}\t{}\t{}\t{}\t{}\n", - clasify, dna_id, ext_call, seq_len_str, hit_string - ); - output_line -} - -fn process_seq1( - miner: Vec, - hash_config: &HashConfig, - chtable: &CHTable, - offset: u32, -) -> (u32, Vec) { - let chunk_size = hash_config.hash_capacity; - let value_bits = hash_config.value_bits; - - let mut rows = Vec::new(); - let mut kmer_count 
= 0; - for (sort, hash_key) in miner.into_iter().enumerate() { - let idx = hash_config.index(hash_key); - let partition_index = idx / chunk_size; - let index = idx % chunk_size; - let taxid = chtable.get_from_page(index, hash_key, partition_index + 1); - if taxid > 0 { - let compacted_key = hash_key.left(value_bits) as u32; - let high = u32::combined(compacted_key, taxid, value_bits); - let row = Row::new(high, 0, sort as u32 + 1 + offset); - rows.push(row); + let seq_len_str = seq.fmt_size(); + let (kmer_count1, kmer_count2, rows) = match &seq.marker { + BaseType::Single(marker) => ( + marker.size(), + 0, + process_seq(&marker.minimizer, &hash_config, chtable, 0), + ), + BaseType::Pair((marker1, marker2)) => { + let mut rows = process_seq(&marker1.minimizer, &hash_config, chtable, 0); + let seq_len1 = marker1.size(); + let rows2 = process_seq(&marker2.minimizer, &hash_config, chtable, seq_len1 as u32); + rows.extend_from_slice(&rows2); + (seq_len1, marker2.size(), rows) } - kmer_count += 1; - } - (kmer_count, rows) -} - -fn process_record1( - dna_id: String, - seq1: Vec, - seq2: Option>, - args: &Args, - taxonomy: &Taxonomy, - meros: Meros, - chtable: &CHTable, - hash_config: &HashConfig, - cur_taxon_counts: &TaxonCountersDash, - classify_counter: &AtomicUsize, -) -> String { - let value_mask = hash_config.value_mask; - let mut seq_len_str = String::new(); - let seq1_len = seq1.len(); - seq_len_str.push_str(&seq1_len.to_string()); - - let (kmer_count1, mut rows) = process_seq1(seq1, &hash_config, chtable, 0); - let kmer_count2 = if let Some(seq) = seq2 { - let scan2 = MinimizerScanner::new(&seq, meros); - let (kmer_count2, rows2) = process_seq(scan2, &hash_config, chtable, kmer_count1); - rows.extend_from_slice(&rows2); - seq_len_str.push_str(format!("|{}", seq.len()).as_str()); - Some(kmer_count2) - } else { - None }; - let total_kmers: usize = (kmer_count1 + kmer_count2.unwrap_or(0)) as usize; - let (counts, cur_counts, hit_groups) = count_values(&rows, 
value_mask, kmer_count1); - let hit_string = add_hitlist_string(&rows, value_mask, kmer_count1, kmer_count2, taxonomy); + let total_kmers = kmer_count1 + kmer_count2; + let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, kmer_count1 as u32); + let hit_string = add_hitlist_string( + &rows, + value_mask, + kmer_count1 as u32, + Some(kmer_count2 as u32), + taxonomy, + ); let mut call = resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold); if call > 0 && hit_groups < args.minimum_hit_groups { call = 0; @@ -277,20 +186,16 @@ fn process_record1( output_line } -fn process_fasta_file( +fn process_fastx_file( args: &Args, meros: Meros, hash_config: HashConfig, file_index: usize, - files: &[String], + reader: &mut Box, chtable: &CHTable, taxonomy: &Taxonomy, total_taxon_counts: &mut TaxonCounters, ) -> io::Result<(usize, usize)> { - let score = args.minimum_quality_score; - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let mut writer: Box = match &args.kraken_output_dir { Some(ref file_path) => { let filename = file_path.join(format!("output_{}.txt", file_index)); @@ -301,30 +206,26 @@ fn process_fasta_file( }; let cur_taxon_counts = TaxonCountersDash::new(); + let sequence_count = AtomicUsize::new(0); let classify_counter = AtomicUsize::new(0); - let reader = open_fasta_reader(&file1).expect("Unable to create fasta reader from path"); - read_parallel( + let _ = read_parallel( reader, - args.num_threads as u32, - args.num_threads as usize, - |record_set| { + 13, + 15, + meros, + |seqs| { let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = trim_pair_info(records.id().unwrap_or_default()); + for seq in seqs { + let dna_id = trim_pair_info(&seq.id); sequence_count.fetch_add(1, Ordering::SeqCst); - let seq1: Vec = records.seq_x(score); - let seq2 = None; let output_line = process_record( dna_id, - seq1, - seq2, + &seq, args, taxonomy, - meros, chtable, 
&hash_config, &cur_taxon_counts, @@ -333,13 +234,14 @@ fn process_fasta_file( buffer.push_str(&output_line); } - buffer + + Some(buffer) }, - |record_sets| { - while let Some(Ok((_, buffer))) = record_sets.next() { + |dataset| { + while let Ok(Some(res)) = dataset.next() { writer - .write_all(buffer.as_bytes()) - .expect("write data error"); + .write_all(res.as_bytes()) + .expect("Failed to write date to file"); } }, ); @@ -379,138 +281,6 @@ fn process_fasta_file( Ok((thread_sequences, thread_sequences - thread_classified)) } -/// fastq -fn process_fastq_file( - args: &Args, - meros: Meros, - hash_config: HashConfig, - file_index: usize, - files: &[String], - chtable: &CHTable, - taxonomy: &Taxonomy, - total_taxon_counts: &mut TaxonCounters, -) -> io::Result<(usize, usize)> { - let score = args.minimum_quality_score; - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); - - let mut writer: Box = match &args.kraken_output_dir { - Some(ref file_path) => { - let filename = file_path.join(format!("output_{}.txt", file_index)); - let file = File::create(filename)?; - Box::new(BufWriter::new(file)) as Box - } - None => Box::new(io::stdout()) as Box, - }; - - let cur_taxon_counts = TaxonCountersDash::new(); - - let sequence_count = AtomicUsize::new(0); - let classify_counter = AtomicUsize::new(0); - - let mut reader1 = FastqReader::from_path(&file1, 1, 0)?; - let _ = s_parallel( - &mut reader1, - 13, - 15, - None, - SMeros::default(), - |seq1, seq| { - let dna_id = trim_pair_info(&seq.id); - sequence_count.fetch_add(1, Ordering::SeqCst); - - let seq2 = None; - let output_line = process_record1( - dna_id, - seq1, - seq2, - args, - taxonomy, - meros, - chtable, - &hash_config, - &cur_taxon_counts, - &classify_counter, - ); - None - }, - ); - // let reader = seq::PairFastqReader::from_path(&file1, file2.as_ref()) - // .expect("Unable to create pair reader from paths"); - // read_parallel( - // 
reader, - // args.num_threads as u32, - // args.num_threads as usize, - // |record_set| { - // let mut buffer = String::new(); - - // for records in record_set.into_iter() { - // let dna_id = trim_pair_info(records.0.id().unwrap_or_default()); - // sequence_count.fetch_add(1, Ordering::SeqCst); - // let seq1: Vec = records.0.seq_x(score); - // let seq2 = records.1.map(|seq| seq.seq_x(score)); - // let output_line = process_record( - // dna_id, - // seq1, - // seq2, - // args, - // taxonomy, - // meros, - // chtable, - // &hash_config, - // &cur_taxon_counts, - // &classify_counter, - // ); - - // buffer.push_str(&output_line); - // } - // buffer - // }, - // |record_sets| { - // while let Some(Ok((_, buffer))) = record_sets.next() { - // writer - // .write_all(buffer.as_bytes()) - // .expect("write data error"); - // } - // }, - // ); - - let mut sample_taxon_counts: HashMap< - u64, - kr2r::readcounts::ReadCounts>, - > = HashMap::new(); - cur_taxon_counts.iter().for_each(|entry| { - total_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(&entry.value()) - .unwrap(); - sample_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(&entry.value()) - .unwrap(); - }); - - let thread_sequences = sequence_count.load(Ordering::SeqCst); - let thread_classified = classify_counter.load(Ordering::SeqCst); - if let Some(output) = &args.kraken_output_dir { - let filename = output.join(format!("output_{}.kreport2", file_index)); - report_kraken_style( - filename, - args.report_zero_counts, - args.report_kmer_data, - &taxonomy, - &sample_taxon_counts, - thread_sequences as u64, - (thread_sequences - thread_classified) as u64, - )?; - } - - Ok((thread_sequences, thread_sequences - thread_classified)) -} - fn process_files( args: Args, meros: Meros, @@ -542,36 +312,34 @@ fn process_files( writeln!(file_writer, "{}\t{}", file_index, file_pair.join(","))?; file_writer.flush().unwrap(); - match detect_file_format(&file_pair[0])? 
{ + let mut files_iter = file_pair.iter(); + let file1 = files_iter.next().cloned().unwrap(); + let file2 = files_iter.next().cloned(); + let score = args.minimum_quality_score; + + let mut reader: Box = match detect_file_format(&file_pair[0])? { FileFormat::Fastq => { - let (thread_sequences, thread_unclassified) = process_fastq_file( - &args, - meros, - hash_config, - file_index, - file_pair, - chtable, - taxonomy, - &mut total_taxon_counts, - )?; - total_seqs += thread_sequences; - total_unclassified += thread_unclassified; - } - FileFormat::Fasta => { - let (thread_sequences, thread_unclassified) = process_fasta_file( - &args, - meros, - hash_config, - file_index, - file_pair, - chtable, - taxonomy, - &mut total_taxon_counts, - )?; - total_seqs += thread_sequences; - total_unclassified += thread_unclassified; + if let Some(file2) = file2 { + Box::new(FastqPairReader::from_path(file1, file2, file_index, score)?) + } else { + Box::new(FastqReader::from_path(file1, file_index, score)?) + } } - } + FileFormat::Fasta => Box::new(FastaReader::from_path(file1, file_index)?), + }; + + let (thread_sequences, thread_unclassified) = process_fastx_file( + &args, + meros, + hash_config, + file_index, + &mut reader, + chtable, + taxonomy, + &mut total_taxon_counts, + )?; + total_seqs += thread_sequences; + total_unclassified += thread_unclassified; } if let Some(output) = &args.kraken_output_dir { let filename = output.join("output.kreport2"); @@ -624,7 +392,7 @@ pub fn run(args: Args) -> Result<()> { } println!("start..."); let start = Instant::now(); - let meros = idx_opts.as_meros(); + let meros = idx_opts.as_smeros(); let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; let chtable = CHTable::from_hash_files(hash_config, hash_files)?; diff --git a/kr2r/src/kr2r_data.rs b/kr2r/src/kr2r_data.rs index b993634..792f2c1 100644 --- a/kr2r/src/kr2r_data.rs +++ b/kr2r/src/kr2r_data.rs @@ -9,6 +9,8 @@ use std::io::{Read, Result as IoResult, Write}; use std::mem; 
use std::path::Path; +use seqkmer::Meros as SMeros; + pub fn parse_binary(src: &str) -> Result { u64::from_str_radix(src, 2) } @@ -180,4 +182,14 @@ impl IndexOptions { u64_to_option(self.minimum_acceptable_hash_value), ) } + + pub fn as_smeros(&self) -> SMeros { + SMeros::new( + self.k, + self.l, + u64_to_option(self.spaced_seed_mask), + u64_to_option(self.toggle_mask), + u64_to_option(self.minimum_acceptable_hash_value), + ) + } } diff --git a/seqkmer/Cargo.toml b/seqkmer/Cargo.toml index 03a50de..76a0d27 100644 --- a/seqkmer/Cargo.toml +++ b/seqkmer/Cargo.toml @@ -11,7 +11,6 @@ crossbeam-channel = "0.5" scoped_threadpool = "0.1.9" flate2 = "1.0" - [features] default = ["dna"] dna = [] diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs new file mode 100644 index 0000000..aa0e173 --- /dev/null +++ b/seqkmer/src/fasta.rs @@ -0,0 +1,106 @@ +use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; +use crate::seq::{BaseType, SeqFormat, Sequence}; +use std::io::{BufRead, BufReader, Read, Result}; +use std::path::Path; + +/// FastaReader +pub struct FastaReader +where + R: Read + Send, +{ + reader: BufReader, + file_index: usize, + reads_index: usize, + header: Vec, + seq: Vec, +} + +impl FastaReader +where + R: Read + Send, +{ + pub fn new(reader: R, file_index: usize) -> Self { + Self::with_capacity(reader, file_index, BUFSIZE) + } + + pub fn with_capacity(reader: R, file_index: usize, capacity: usize) -> Self { + assert!(capacity >= 3); + Self { + reader: BufReader::with_capacity(capacity, reader), + file_index, + reads_index: 0, + header: Vec::new(), + seq: Vec::new(), + } + } + + pub fn read_next(&mut self) -> Result> { + // 读取fastq文件header部分 + self.header.clear(); + if self.reader.read_until(b'\n', &mut self.header)? == 0 { + return Ok(None); + } + // 读取fasta文件seq部分 + self.seq.clear(); + if self.reader.read_until(b'>', &mut self.seq)? 
== 0 { + return Ok(None); + } + trim_end(&mut self.seq); + Ok(Some(())) + } +} + +impl FastaReader> { + #[inline] + pub fn from_path>(path: P, file_index: usize) -> Result { + let reader = dyn_reader(path)?; + Ok(Self::new(reader, file_index)) + } +} + +fn check_sequence_length(seq: &Vec) -> bool { + let limit = u64::pow(2, 32); + // 检查seq的长度是否大于2的32次方 + (seq.len() as u64) > limit +} + +impl Reader for FastaReader { + fn next(&mut self) -> Result>> { + if self.read_next()?.is_none() { + return Ok(None); + } + + if check_sequence_length(&self.seq) { + eprintln!("Sequence length exceeds 2^32, which is not handled."); + return Ok(None); + } + + let seq_id = unsafe { + let slice = if self.header.starts_with(b">") { + &self.header[1..] + } else { + &self.header[..] + }; + + let s = std::str::from_utf8_unchecked(slice); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + self.reads_index += 1; + + let sequence = Sequence { + file_index: self.file_index, + reads_index: self.reads_index, + id: seq_id.to_owned(), + seq: BaseType::Single(self.seq.to_owned()), + format: SeqFormat::Fasta, + }; + Ok(Some(vec![sequence])) + } +} diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index 698114c..ab61784 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -1,46 +1,35 @@ use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; -use crate::seq::{SeqFormat, Sequence}; +use crate::seq::{BaseType, SeqFormat, Sequence}; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; -/// FastqReader -pub struct FastqReader { - pub reader: BufReader, - pub file_index: u64, - pub reads_index: u64, - pub seq_id: String, +struct QReader { + reader: BufReader, + quality_score: i32, - score: i32, header: Vec, seq: Vec, plus: Vec, quals: Vec, } -impl FastqReader +impl QReader where R: Read + Send, { - pub fn new(reader: R, file_index: u64, score: i32) -> Self { 
- Self::with_capacity(reader, file_index, BUFSIZE, score) - } - - pub fn with_capacity<'a>(reader: R, file_index: u64, capacity: usize, score: i32) -> Self { + pub fn with_capacity(reader: R, capacity: usize, quality_score: i32) -> Self { assert!(capacity >= 3); Self { reader: BufReader::with_capacity(capacity, reader), - file_index, - reads_index: 0, - seq_id: String::new(), header: Vec::new(), seq: Vec::new(), plus: Vec::new(), quals: Vec::new(), - score, + quality_score, } } - pub fn read_lines(&mut self) -> Result> { + pub fn read_next(&mut self) -> Result> { // 读取fastq文件header部分 self.header.clear(); if self.reader.read_until(b'\n', &mut self.header)? == 0 { @@ -66,8 +55,58 @@ where } trim_end(&mut self.quals); + if self.quality_score > 0 { + for (base, &qscore) in self.seq.iter_mut().zip(self.quals.iter()) { + if (qscore as i32 - '!' as i32) < self.quality_score { + *base = b'x'; + } + } + } + + Ok(Some(())) + } +} + +/// FastqReader +pub struct FastqReader { + inner: QReader, + file_index: usize, + reads_index: usize, + // 批量读取 + batch_size: usize, +} + +impl FastqReader +where + R: Read + Send, +{ + pub fn new(reader: R, file_index: usize, quality_score: i32) -> Self { + Self::with_capacity(reader, file_index, BUFSIZE, quality_score, 30) + } + + pub fn with_capacity<'a>( + reader: R, + file_index: usize, + capacity: usize, + quality_score: i32, + batch_size: usize, + ) -> Self { + assert!(capacity >= 3); + Self { + inner: QReader::with_capacity(reader, capacity, quality_score), + file_index, + reads_index: 0, + batch_size, + } + } + + pub fn read_next(&mut self) -> Result> { + if self.inner.read_next()?.is_none() { + return Ok(None); + } + let seq_id = unsafe { - let s = std::str::from_utf8_unchecked(&self.header[1..]); + let s = std::str::from_utf8_unchecked(&self.inner.header[1..]); let first_space_index = s .as_bytes() .iter() @@ -79,20 +118,12 @@ where }; self.reads_index += 1; - if self.score > 0 { - for (base, &qscore) in 
self.seq.iter_mut().zip(self.quals.iter()) { - if (qscore as i32 - '!' as i32) < self.score { - *base = b'x'; - } - } - } - let sequence = Sequence { file_index: self.file_index, reads_index: self.reads_index, id: seq_id.to_owned(), - seq: self.seq.to_owned(), - format: SeqFormat::FASTQ, + seq: BaseType::Single(self.inner.seq.to_owned()), + format: SeqFormat::Fastq, }; Ok(Some(sequence)) } @@ -100,17 +131,121 @@ where impl FastqReader> { #[inline] - pub fn from_path>(path: P, file_index: u64, score: i32) -> Result { + pub fn from_path>( + path: P, + file_index: usize, + quality_score: i32, + ) -> Result { let reader = dyn_reader(path)?; - Ok(Self::new(reader, file_index, score)) + Ok(Self::new(reader, file_index, quality_score)) } } -impl Reader for FastqReader +impl Reader for FastqReader where R: Read + Send, { - fn next(&mut self) -> Result> { - self.read_lines() + fn next(&mut self) -> Result>> { + let seqs: Vec = (0..self.batch_size) + .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> + .collect::>>()?; + + Ok(Some(seqs).filter(|v| !v.is_empty())) + } +} + +/// FastqPairReader +pub struct FastqPairReader { + inner1: QReader, + inner2: QReader, + file_index: usize, + reads_index: usize, + // 批量读取 + batch_size: usize, +} + +impl FastqPairReader +where + R: Read + Send, +{ + pub fn new(reader1: R, reader2: R, file_index: usize, score: i32) -> Self { + Self::with_capacity(reader1, reader2, file_index, BUFSIZE, score, 30) + } + + pub fn with_capacity<'a>( + reader1: R, + reader2: R, + file_index: usize, + capacity: usize, + score: i32, + batch_size: usize, + ) -> Self { + assert!(capacity >= 3); + Self { + inner1: QReader::with_capacity(reader1, capacity, score), + inner2: QReader::with_capacity(reader2, capacity, score), + file_index, + reads_index: 0, + batch_size, + } + } + + pub fn read_next(&mut self) -> Result> { + if self.inner1.read_next()?.is_none() { + return Ok(None); + } + + if self.inner2.read_next()?.is_none() { + return 
Ok(None); + } + + let seq_id = unsafe { + let s = std::str::from_utf8_unchecked(&self.inner1.header[1..]); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + self.reads_index += 1; + + let sequence = Sequence { + file_index: self.file_index, + reads_index: self.reads_index, + id: seq_id.to_owned(), + seq: BaseType::Pair((self.inner1.seq.to_owned(), self.inner2.seq.to_owned())), + format: SeqFormat::PairFastq, + }; + Ok(Some(sequence)) + } +} + +impl FastqPairReader> { + #[inline] + pub fn from_path>( + path1: P, + path2: P, + file_index: usize, + quality_score: i32, + ) -> Result { + let reader1 = dyn_reader(path1)?; + let reader2 = dyn_reader(path2)?; + Ok(Self::new(reader1, reader2, file_index, quality_score)) + } +} + +impl Reader for FastqPairReader +where + R: Read + Send, +{ + fn next(&mut self) -> Result>> { + let seqs: Vec = (0..self.batch_size) + .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> + .collect::>>()?; + + Ok(Some(seqs).filter(|v| !v.is_empty())) } } diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs index 758e421..6a6d58d 100644 --- a/seqkmer/src/lib.rs +++ b/seqkmer/src/lib.rs @@ -1,4 +1,5 @@ -pub mod fastq; +mod fasta; +mod fastq; mod feat; pub mod mmscanner; pub mod reader; @@ -6,3 +7,5 @@ pub mod seq; pub use feat::constants::*; pub use feat::*; pub mod parallel; +pub use fasta::*; +pub use fastq::*; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs index b0d1fed..8fe7498 100644 --- a/seqkmer/src/mmscanner.rs +++ b/seqkmer/src/mmscanner.rs @@ -1,4 +1,5 @@ // kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash +use crate::seq::{BaseType, Marker}; use crate::{ canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, }; @@ -15,7 +16,7 @@ fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { #[derive(Debug)] struct MinimizerData { - pub pos: usize, + pos: usize, 
candidate_lmer: u64, } @@ -134,14 +135,14 @@ impl Cursor { } pub struct MinimizerScanner<'a> { - seq: &'a [u8], + seq: &'a BaseType>, meros: Meros, cursor: Cursor, window: MinimizerWindow, } impl<'a> MinimizerScanner<'a> { - pub fn new(seq: &'a [u8], meros: Meros) -> Self { + pub fn new(seq: &'a BaseType>, meros: Meros) -> Self { MinimizerScanner { seq, meros, @@ -156,27 +157,33 @@ impl<'a> MinimizerScanner<'a> { self.window.clear(); } - pub fn iter(&mut self) -> Vec { - self.seq + fn iter_seq(&mut self, seq: &Vec) -> Marker { + let minimizer = seq .iter() .filter_map(|&ch| { - // if ch == b'\n' || ch == b'\r' { - // None - // } else { - match char_to_value(ch) { - Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { - let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); - self.window - .next(candidate_lmer) - .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) - }), - None => { - self.clear(); - None + if ch == b'\n' || ch == b'\r' { + None + } else { + match char_to_value(ch) { + Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { + let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); + self.window + .next(candidate_lmer) + .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) + }), + None => { + self.clear(); + None + } } } - // } }) - .collect() + .collect(); + + Marker::new(seq.len(), minimizer) + } + + pub fn iter(&mut self) -> BaseType { + self.seq.apply(|seq| self.iter_seq(seq)) } } diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index 505b3d1..644c6ee 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -1,83 +1,86 @@ -use crate::mmscanner::MinimizerScanner; -use crate::reader::Reader; +use crate::reader::{Reader, SeqMer}; use crate::seq::Sequence; use crate::Meros; -use crossbeam_channel::bounded; +use crossbeam_channel::{bounded, Receiver, RecvError}; use scoped_threadpool::Pool; -use std::fs::File; -use std::io::Read; -use std::io::{self, BufWriter, Result, 
Write}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::io::Result; use std::sync::Arc; -pub fn read_parallel( - reader: &mut dyn Reader, +pub struct ParallelResult

+where + P: Send, +{ + recv: Receiver

, +} + +impl

ParallelResult

+where + P: Send, +{ + #[inline] + pub fn next(&mut self) -> std::result::Result { + self.recv.recv() + } +} + +pub fn read_parallel( + reader: &mut Box, n_threads: usize, buffer_len: usize, - output_file: Option<&PathBuf>, meros: Meros, work: W, + func: F, ) -> Result<()> where - R: Read + Send, - W: Send + Sync + Fn(Vec, Sequence) -> Option, + O: Send, + Out: Send + Default, + W: Send + Sync + Fn(Vec) -> Option, + F: FnOnce(&mut ParallelResult>) -> Out + Send, { + assert!(n_threads > 2); assert!(n_threads <= buffer_len); - let (sender, receiver) = bounded::(buffer_len); + let (sender, receiver) = bounded::>(buffer_len); + let (done_send, done_recv) = bounded::>(buffer_len); let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver - let mut pool = Pool::new(10); - - let counter = Arc::new(AtomicUsize::new(0)); + let done_send = Arc::new(done_send); + let mut pool = Pool::new(n_threads as u32); - let mut writer: Box = match output_file { - Some(file_name) => { - let file = File::create(file_name)?; - Box::new(BufWriter::new(file)) as Box - } - None => Box::new(io::stdout()) as Box, - }; + let mut parallel_result = ParallelResult { recv: done_recv }; - let _ = pool.scoped(|pool_scope| -> Result<()> { + pool.scoped(|pool_scope| { // 生产者线程 pool_scope.execute(move || { - while let Some(seq) = reader.next().unwrap() { - sender.send(seq).unwrap(); + while let Ok(Some(seqs)) = reader.next() { + sender.send(seqs).expect("Failed to send sequences"); } }); // 消费者线程 - for i in 0..n_threads { + for _ in 0..n_threads - 2 { let receiver = Arc::clone(&receiver); - let counter_clone = Arc::clone(&counter); let work = &work; - - let mut temp_writer: Box = match output_file { - Some(file_name) => { - let parent_dir = file_name.parent().unwrap_or_else(|| Path::new("")); - let file_name = file_name.file_name().unwrap().to_str().unwrap(); - let filename = parent_dir.join(format!("{}.tmp.{}", file_name, i)); - let file = File::create(filename)?; - Box::new(BufWriter::new(file)) as Box 
- } - None => Box::new(io::stdout()) as Box, - }; + let done_send = Arc::clone(&done_send); pool_scope.execute(move || { - while let Ok(seq) = receiver.recv() { - counter_clone.fetch_add(1, Ordering::Relaxed); - let mut ms = MinimizerScanner::new(&seq.seq, meros); - let res = ms.iter(); - if let Some(out) = work(res, seq) { - temp_writer - .write_all(out.as_bytes()) - .expect("write data error"); - } + while let Ok(seqs) = receiver.recv() { + let seq_mers: Vec = seqs + .iter() + .map(|seq| SeqMer::from_seq(seq, meros)) + .collect(); + + let output = work(seq_mers); + done_send.send(output).expect("Failed to send outputs"); } }); } + + // 引用计数减掉一个,这样都子线程结束时, done_send还能完全释放 + drop(done_send); + pool_scope.execute(move || { + let _ = func(&mut parallel_result); + }); + pool_scope.join_all(); - Ok(()) }); - println!("counter {:?}", counter.load(Ordering::Relaxed)); + Ok(()) } diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index c7bc395..04c27e1 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -1,4 +1,5 @@ -use crate::seq::Sequence; +use crate::seq::{BaseType, Marker, Sequence}; +use crate::{mmscanner::MinimizerScanner, Meros}; use flate2::read::GzDecoder; use std::fs::File; use std::io::{self, Read, Result, Seek}; @@ -32,13 +33,45 @@ pub fn open_file>(path: P) -> Result { } pub fn trim_end(buffer: &mut Vec) { - while let Some(&b'\n' | &b'\r') = buffer.last() { + while let Some(&b'\n' | &b'\r' | &b'>' | &b'@') = buffer.last() { buffer.pop(); } } -pub const BUFSIZE: usize = 8 * 1024 * 1024; +pub const BUFSIZE: usize = 16 * 1024 * 1024; -pub trait Reader: Send { - fn next(&mut self) -> Result>; +pub trait Reader: Send { + fn next(&mut self) -> Result>>; +} + +#[derive(Debug, Clone)] +pub struct SeqMer { + pub id: String, + pub file_index: usize, + pub reads_index: usize, + pub marker: BaseType, +} + +impl SeqMer { + pub fn from_seq(seq: &Sequence, meros: Meros) -> Self { + let mut ms = MinimizerScanner::new(&seq.seq, meros); + let marker = 
ms.iter(); + Self { + marker, + id: seq.id.clone(), + file_index: seq.file_index, + reads_index: seq.reads_index, + } + } + + pub fn size_str(&self) -> BaseType { + self.marker.apply(|marker| marker.cap.to_string()) + } + + pub fn fmt_size(&self) -> String { + match &self.marker { + BaseType::Single(marker1) => marker1.cap.to_string(), + BaseType::Pair((marker1, marker2)) => format!("{}:{}", marker1.cap, marker2.cap), + } + } } diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs index bfd4de8..c0e2840 100644 --- a/seqkmer/src/seq.rs +++ b/seqkmer/src/seq.rs @@ -1,14 +1,56 @@ #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum SeqFormat { - FASTA, - FASTQ, + Fasta, + Fastq, + PairFastq, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BaseType { + Single(U), + Pair((U, U)), +} + +impl BaseType { + // 泛型方法,根据序列类型执行操作 + pub fn apply(&self, mut func: F) -> BaseType + where + F: FnMut(&T) -> U, + { + match self { + BaseType::Single(seq) => BaseType::Single(func(seq)), + BaseType::Pair((seq1, seq2)) => BaseType::Pair((func(seq1), func(seq2))), + } + } +} + +impl BaseType> { + pub fn len(&self) -> BaseType { + self.apply(|seq| seq.len()) + } +} + +#[derive(Debug, Clone)] +pub struct Marker { + pub cap: usize, + pub minimizer: Vec, +} + +impl Marker { + pub fn new(cap: usize, minimizer: Vec) -> Self { + Self { cap, minimizer } + } + + pub fn size(&self) -> usize { + self.minimizer.len() + } } #[derive(Debug, Clone)] pub struct Sequence { - pub file_index: u64, - pub reads_index: u64, + pub file_index: usize, + pub reads_index: usize, pub id: String, - pub seq: Vec, + pub seq: BaseType>, pub format: SeqFormat, } From 16f465bce906cc82165e4ebd14b74511bcbdeb1a Mon Sep 17 00:00:00 2001 From: dagou Date: Sun, 16 Jun 2024 15:26:53 +0800 Subject: [PATCH 03/18] seq kmer --- kr2r/Cargo.toml | 14 +- kr2r/benches/hash_benchmark.rs | 32 ---- kr2r/benches/mmscanner_benchmark.rs | 20 --- kr2r/src/args.rs | 10 +- kr2r/src/bin/classify.rs | 73 +++----- 
kr2r/src/bin/estimate_capacity.rs | 39 ++-- kr2r/src/bin/splitr.rs | 234 +++++++----------------- kr2r/src/classify.rs | 87 +++++++++ kr2r/src/db.rs | 107 ++++++++--- kr2r/src/feat.rs | 121 ------------- kr2r/src/kr2r_data.rs | 75 +------- kr2r/src/lib.rs | 9 +- kr2r/src/mmscanner.rs | 270 ---------------------------- kr2r/src/seq.rs | 270 ---------------------------- seqkmer/src/feat.rs | 2 +- seqkmer/src/lib.rs | 16 +- seqkmer/src/parallel.rs | 11 +- seqkmer/src/reader.rs | 64 ++++++- seqkmer/src/seq.rs | 33 +++- 19 files changed, 399 insertions(+), 1088 deletions(-) delete mode 100644 kr2r/benches/hash_benchmark.rs delete mode 100644 kr2r/benches/mmscanner_benchmark.rs delete mode 100644 kr2r/src/feat.rs delete mode 100644 kr2r/src/mmscanner.rs delete mode 100644 kr2r/src/seq.rs diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml index 791d524..e662178 100644 --- a/kr2r/Cargo.toml +++ b/kr2r/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kr2r" -version = "0.5.0" +version = "0.5.1" edition = "2021" authors = ["eric9n@gmail.com"] @@ -11,16 +11,12 @@ name = "kun_peng" path = "src/bin/kun.rs" [features] -default = ["dna"] -dna = [] -protein = [] double_hashing = [] exact_counting = [] [dependencies] seqkmer = { version = "0.1.0", path = "../seqkmer" } clap = { version = "4.4.10", features = ["derive"] } -seq_io = "0.3.2" hyperloglogplus = { version = "*", features = ["const-loop"] } seahash = "4.1.0" serde = { version = "1.0", features = ["derive"] } @@ -38,11 +34,3 @@ criterion = "0.5.1" twox-hash = "1.6.3" farmhash = {version = "1.1.5"} -[[bench]] -name = "mmscanner_benchmark" -harness = false - - -[[bench]] -name = "hash_benchmark" -harness = false diff --git a/kr2r/benches/hash_benchmark.rs b/kr2r/benches/hash_benchmark.rs deleted file mode 100644 index cba911c..0000000 --- a/kr2r/benches/hash_benchmark.rs +++ /dev/null @@ -1,32 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use kr2r::{fmix64, murmur_hash3, sea_hash}; -use 
std::hash::Hasher; -use twox_hash::xxh3; -extern crate farmhash; - -#[inline] -pub fn xx_hash(key: u64) -> u64 { - let mut xhash = xxh3::Hash64::default(); - xhash.write_u64(key); - xhash.finish() -} - -#[inline] -pub fn farm(key: u64) -> u64 { - // let bytes = key.to_be_bytes(); - // let byte_slce: &[u8] = &bytes; - farmhash::hash64(&key.to_be_bytes()) -} - -fn criterion_benchmark(c: &mut Criterion) { - let key = 0x12345678abcdef01u64; - - c.bench_function("fmix64", |b| b.iter(|| fmix64(black_box(key)))); - c.bench_function("murmur_hash3", |b| b.iter(|| murmur_hash3(black_box(key)))); - c.bench_function("sea_hash", |b| b.iter(|| sea_hash(black_box(key)))); - c.bench_function("xx_hash", |b| b.iter(|| xx_hash(black_box(key)))); - c.bench_function("farm", |b| b.iter(|| farm(black_box(key)))); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/kr2r/benches/mmscanner_benchmark.rs b/kr2r/benches/mmscanner_benchmark.rs deleted file mode 100644 index d728c15..0000000 --- a/kr2r/benches/mmscanner_benchmark.rs +++ /dev/null @@ -1,20 +0,0 @@ -use criterion::{criterion_group, criterion_main, Criterion}; -use kr2r::mmscanner::MinimizerScanner; -use kr2r::Meros; -// 定义性能测试函数 -fn performance_test(c: &mut Criterion) { - let seq: Vec = b"ACGATCGACGACG".to_vec(); - let meros = Meros::new(10, 5, None, None, None); - let mut scanner = MinimizerScanner::new(&seq, meros); - // 这里执行需要测试性能的操作,例如多次调用 next_minimizer - c.bench_function("next", |b| { - b.iter(|| { - let _ = scanner.next(); - // let _ = scanner.next_minimizer(&seq); - }); - }); -} - -// 创建性能测试组 -criterion_group!(benches, performance_test); -criterion_main!(benches); diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs index 43ebe18..aed1933 100644 --- a/kr2r/src/args.rs +++ b/kr2r/src/args.rs @@ -1,10 +1,12 @@ // 使用时需要引用模块路径 use crate::utils::expand_spaced_seed_mask; -use crate::{construct_seed_template, parse_binary, Meros, BITS_PER_CHAR}; -use crate::{ - DEFAULT_KMER_LENGTH, 
DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES, DEFAULT_TOGGLE_MASK, -}; +use crate::{construct_seed_template, parse_binary}; use clap::Parser; +use seqkmer::Meros; +use seqkmer::{ + BITS_PER_CHAR, DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES, + DEFAULT_TOGGLE_MASK, +}; use std::path::PathBuf; pub const U32MAXPLUS: u64 = u32::MAX as u64; diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index ab448c6..c6d6376 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -1,5 +1,5 @@ use clap::Parser; -use kr2r::classify::{add_hitlist_string, count_values, resolve_tree, trim_pair_info}; +use kr2r::classify::{adjust_hitlist_string, count_rows, resolve_tree, trim_pair_info}; use kr2r::compact_hash::{CHTable, Compact, HashConfig, Row}; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; @@ -8,7 +8,6 @@ use kr2r::utils::{ create_sample_file, detect_file_format, find_and_sort_files, get_lastest_file_index, FileFormat, }; use kr2r::IndexOptions; -use seqkmer::seq::BaseType; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -17,10 +16,10 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; -use seqkmer::parallel::read_parallel; -use seqkmer::reader::SeqMer; -use seqkmer::Meros; -use seqkmer::{reader::Reader, FastaReader, FastqPairReader, FastqReader}; +use seqkmer::{ + read_parallel, BaseType, FastaReader, FastqPairReader, FastqReader, HitGroup, Marker, Meros, + Reader, SeqMer, +}; #[derive(Parser, Debug, Clone)] #[clap( @@ -92,17 +91,12 @@ pub struct Args { pub input_files: Vec, } -fn process_seq( - minimizer: &Vec, - hash_config: &HashConfig, - chtable: &CHTable, - offset: u32, -) -> Vec { +fn process_seq(marker: &Marker, hash_config: &HashConfig, chtable: &CHTable) -> HitGroup { let chunk_size = hash_config.hash_capacity; let value_bits = hash_config.value_bits; let mut rows = 
Vec::new(); - for (sort, &hash_key) in minimizer.iter().enumerate() { + for (sort, &hash_key) in marker.minimizer.iter().enumerate() { let idx = hash_config.index(hash_key); let partition_index = idx / chunk_size; let index = idx % chunk_size; @@ -110,11 +104,11 @@ fn process_seq( if taxid > 0 { let compacted_key = hash_key.left(value_bits) as u32; let high = u32::combined(compacted_key, taxid, value_bits); - let row = Row::new(high, 0, sort as u32 + 1 + offset); + let row = Row::new(high, 0, sort as u32 + 1); rows.push(row); } } - rows + HitGroup::new(marker.size(), rows, 0) } fn process_record( @@ -129,30 +123,14 @@ fn process_record( ) -> String { let value_mask = hash_config.value_mask; - let seq_len_str = seq.fmt_size(); - let (kmer_count1, kmer_count2, rows) = match &seq.marker { - BaseType::Single(marker) => ( - marker.size(), - 0, - process_seq(&marker.minimizer, &hash_config, chtable, 0), - ), - BaseType::Pair((marker1, marker2)) => { - let mut rows = process_seq(&marker1.minimizer, &hash_config, chtable, 0); - let seq_len1 = marker1.size(); - let rows2 = process_seq(&marker2.minimizer, &hash_config, chtable, seq_len1 as u32); - rows.extend_from_slice(&rows2); - (seq_len1, marker2.size(), rows) - } - }; - let total_kmers = kmer_count1 + kmer_count2; - let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, kmer_count1 as u32); - let hit_string = add_hitlist_string( - &rows, - value_mask, - kmer_count1 as u32, - Some(kmer_count2 as u32), - taxonomy, - ); + let seq_len_str = seq.fmt_cap(); + let hits: BaseType> = seq + .marker + .apply(|marker| process_seq(&marker, &hash_config, chtable)); + + let total_kmers = seq.total_size(); + let (counts, cur_counts, hit_groups) = count_rows(&hits, value_mask); + let hit_string = adjust_hitlist_string(&hits, value_mask, taxonomy); let mut call = resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold); if call > 0 && hit_groups < args.minimum_hit_groups { call = 0; @@ -186,16 +164,19 @@ fn 
process_record( output_line } -fn process_fastx_file( +fn process_fastx_file( args: &Args, meros: Meros, hash_config: HashConfig, file_index: usize, - reader: &mut Box, + reader: &mut R, chtable: &CHTable, taxonomy: &Taxonomy, total_taxon_counts: &mut TaxonCounters, -) -> io::Result<(usize, usize)> { +) -> io::Result<(usize, usize)> +where + R: Reader, +{ let mut writer: Box = match &args.kraken_output_dir { Some(ref file_path) => { let filename = file_path.join(format!("output_{}.txt", file_index)); @@ -212,8 +193,8 @@ fn process_fastx_file( let _ = read_parallel( reader, - 13, - 15, + args.num_threads as usize - 2, + args.num_threads as usize, meros, |seqs| { let mut buffer = String::new(); @@ -238,7 +219,7 @@ fn process_fastx_file( Some(buffer) }, |dataset| { - while let Ok(Some(res)) = dataset.next() { + while let Some(Some(res)) = dataset.next() { writer .write_all(res.as_bytes()) .expect("Failed to write date to file"); @@ -392,7 +373,7 @@ pub fn run(args: Args) -> Result<()> { } println!("start..."); let start = Instant::now(); - let meros = idx_opts.as_smeros(); + let meros = idx_opts.as_meros(); let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; let chtable = CHTable::from_hash_files(hash_config, hash_files)?; diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index c7c34d8..c27342d 100644 --- a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -1,11 +1,10 @@ use clap::{error::ErrorKind, Error, Parser}; use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; use kr2r::args::KLMTArgs; -use kr2r::mmscanner::MinimizerScanner; use kr2r::utils::{find_library_fna_files, format_bytes, open_file}; use kr2r::KBuildHasher; -use seq_io::fasta::{Reader, Record}; -use seq_io::parallel::read_parallel; + +use seqkmer::{read_parallel, FastaReader}; use serde_json; use std::collections::HashSet; use std::fs::File; @@ -84,33 +83,39 @@ fn process_sequence( let mut hllp: HyperLogLogPlus = 
HyperLogLogPlus::new(16, KBuildHasher::default()).unwrap(); - let reader = Reader::from_path(fna_file).unwrap(); + let mut reader = FastaReader::from_path(fna_file, 1).unwrap(); let range_n = args.n as u64; read_parallel( - reader, - args.threads as u32, - args.threads - 2 as usize, + &mut reader, + args.threads, + args.threads - 2, + meros, |record_set| { let mut minimizer_set = HashSet::new(); - for record in record_set.into_iter() { - let seq = record.seq(); - let kmer_iter = MinimizerScanner::new(&seq, meros) - .into_iter() - .filter(|hash_key| hash_key & RANGE_MASK < range_n) - .collect::>(); - minimizer_set.extend(kmer_iter); + for record in record_set.into_iter() { + record + .marker + .fold(&mut minimizer_set, |minimizer_set, marker| { + let kmer_iter = marker + .minimizer + .iter() + .filter(|&hash_key| hash_key & RANGE_MASK < range_n); + + minimizer_set.extend(kmer_iter); + }); } - minimizer_set + Some(minimizer_set) }, |record_sets| { - while let Some(Ok((_, m_set))) = record_sets.next() { + while let Some(Some(m_set)) = record_sets.next() { for minimizer in m_set { hllp.insert(&minimizer); } } }, - ); + ) + .expect("read parallel error"); // 序列化 hllp 对象并将其写入文件 let serialized_hllp = serde_json::to_string(&hllp).unwrap(); diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index 372fd8c..fb0046d 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -1,22 +1,16 @@ +use clap::Parser; use kr2r::compact_hash::{HashConfig, Slot}; -use kr2r::mmscanner::MinimizerScanner; -use kr2r::seq::{self, open_fasta_reader, SeqX}; use kr2r::utils::{ create_partition_files, create_partition_writers, create_sample_file, detect_file_format, get_file_limit, get_lastest_file_index, FileFormat, }; -use kr2r::{IndexOptions, Meros}; -use seq_io::fasta::Record; -use seq_io::fastq::Record as FqRecord; -use seq_io::parallel::read_parallel; +use kr2r::IndexOptions; +use seqkmer::{read_parallel, FastaReader, FastqPairReader, FastqReader, Marker, Meros, Reader}; 
use std::fs; use std::io::{BufWriter, Write}; use std::io::{Error, ErrorKind, Result}; use std::path::PathBuf; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; - -use clap::Parser; /// Command line arguments for the splitr program. /// /// This structure defines the command line arguments that are accepted by the splitr program. @@ -100,31 +94,23 @@ fn init_chunk_writers( } /// 处理record -fn process_record( - iter: I, +fn process_record( + k2_slot_list: &mut Vec<(usize, Slot)>, + marker: &Marker, hash_config: &HashConfig, - seq_id: u64, chunk_size: usize, + seq_id: u64, idx_bits: usize, - seq_index: &AtomicUsize, -) -> (usize, Vec<(usize, Slot)>) -where - I: Iterator, -{ - let mut k2_slot_list = Vec::new(); - let mut kmer_count = 0; - - for hash_key in iter.into_iter() { +) { + let offset = k2_slot_list.len(); + for (sort, &hash_key) in marker.minimizer.iter().enumerate() { let mut slot = hash_config.slot_u64(hash_key, seq_id); - let seq_sort = seq_index.fetch_add(1, Ordering::SeqCst); + let seq_sort = sort + offset; let partition_index = slot.idx / chunk_size; slot.idx = seq_sort << idx_bits | (slot.idx % chunk_size); - k2_slot_list.push((partition_index, slot)); - kmer_count += 1; } - (kmer_count, k2_slot_list) } fn write_data_to_file( @@ -144,153 +130,57 @@ fn write_data_to_file( sample_writer.write_all(k2_map.as_bytes()).unwrap(); } -fn process_fastq_file( +fn process_fastx_file( args: &Args, meros: Meros, hash_config: HashConfig, file_index: usize, - files: &[String], + reader: &mut R, writers: &mut Vec>, sample_writer: &mut BufWriter, -) { +) -> Result<()> +where + R: Reader, +{ let chunk_size = hash_config.hash_capacity; let idx_bits = ((chunk_size as f64).log2().ceil() as usize).max(1); let slot_size = std::mem::size_of::>(); - let score = args.minimum_quality_score; - - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); - - let line_index = 
AtomicUsize::new(0); - let reader = seq::PairFastqReader::from_path(&file1, file2.as_ref()) - .expect("Unable to create pair reader from paths"); read_parallel( reader, - args.num_threads as u32, + args.num_threads as usize - 2, args.num_threads as usize, - |record_set| { - let mut k2_slot_list = Vec::new(); - + meros, + |seqs| { let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = records.0.id().unwrap_or_default().to_string(); - // 拼接seq_id - let index = line_index.fetch_add(1, Ordering::SeqCst); - let seq_id = (file_index << 32 | index) as u64; - let seq_index = AtomicUsize::new(0); - - let seq1 = records.0.seq_x(score); - let scan1 = MinimizerScanner::new(&seq1, meros); - - let (kmer_count1, slot_list1) = process_record( - scan1, - &hash_config, - seq_id, - chunk_size, - idx_bits, - &seq_index, - ); - - k2_slot_list.extend(slot_list1); - let (kmer_count, seq_size) = if let Some(record3) = records.1 { - let seq2 = record3.seq_x(score); - let scan2 = MinimizerScanner::new(&seq2, meros); - let (kmer_count2, slot_list2) = process_record( - scan2, - &hash_config, - seq_id, - chunk_size, - idx_bits, - &seq_index, - ); - k2_slot_list.extend(slot_list2); - ( - format!("{}|{}", kmer_count1, kmer_count2), - format!("{}|{}", seq1.len(), seq2.len()), - ) - } else { - (kmer_count1.to_string(), format!("{}", seq1.len())) - }; - - buffer.push_str( - format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_size, kmer_count).as_str(), - ); - } - (buffer, k2_slot_list) - }, - |record_sets| { - while let Some(Ok((_, (k2_map, k2_slot_list)))) = record_sets.next() { - write_data_to_file(k2_map, k2_slot_list, writers, slot_size, sample_writer); - } - }, - ) -} - -fn process_fasta_file( - args: &Args, - meros: Meros, - hash_config: HashConfig, - file_index: usize, - files: &[String], - writers: &mut Vec>, - sample_writer: &mut BufWriter, -) { - let chunk_size = hash_config.hash_capacity; - let idx_bits = ((chunk_size as f64).log2().ceil() as usize).max(1); 
- let slot_size = std::mem::size_of::>(); - let score = args.minimum_quality_score; - - let mut files_iter = files.iter(); - let file1 = files_iter.next().cloned().unwrap(); - - let line_index = AtomicUsize::new(0); - - let reader = open_fasta_reader(&file1).expect("Unable to create fasta reader from path"); - read_parallel( - reader, - args.num_threads as u32, - args.num_threads as usize, - |record_set| { let mut k2_slot_list = Vec::new(); - - let mut buffer = String::new(); - - for records in record_set.into_iter() { - let dna_id = records.id().unwrap_or_default().to_string(); - // 拼接seq_id - let index = line_index.fetch_add(1, Ordering::SeqCst); + for seq in &seqs { + let dna_id = seq.id.to_owned(); + let index = seq.reads_index; let seq_id = (file_index << 32 | index) as u64; - let seq_index = AtomicUsize::new(0); - - let seq1 = records.seq_x(score); - let scan1 = MinimizerScanner::new(&seq1, meros); - - let (kmer_count1, slot_list) = process_record( - scan1, - &hash_config, - seq_id, - chunk_size, - idx_bits, - &seq_index, - ); - - k2_slot_list.extend(slot_list); - let (kmer_count, seq_size) = (kmer_count1.to_string(), format!("{}", seq1.len())); + let mut init: Vec<(usize, Slot)> = Vec::new(); + seq.marker.fold(&mut init, |init, marker| { + process_record(init, marker, &hash_config, chunk_size, seq_id, idx_bits) + }); + k2_slot_list.extend(init); + + let seq_cap_str = seq.fmt_cap(); + let seq_size_str = seq.fmt_size(); buffer.push_str( - format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_size, kmer_count).as_str(), + format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_cap_str, seq_size_str).as_str(), ); } - (buffer, k2_slot_list) + Some((buffer, k2_slot_list)) }, - |record_sets| { - while let Some(Ok((_, (k2_map, k2_slot_list)))) = record_sets.next() { - write_data_to_file(k2_map, k2_slot_list, writers, slot_size, sample_writer); + |dataset| { + while let Some(Some((buffer, k2_slot_list))) = dataset.next() { + write_data_to_file(buffer, k2_slot_list, writers, 
slot_size, sample_writer); } }, ) + .expect("failed"); + + Ok(()) } fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { @@ -322,30 +212,32 @@ fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { let mut sample_writer = create_sample_file(args.chunk_dir.join(format!("sample_id_{}.map", file_index))); - match detect_file_format(&file_pair[0])? { + let mut files_iter = file_pair.iter(); + let file1 = files_iter.next().cloned().unwrap(); + let file2 = files_iter.next().cloned(); + let score = args.minimum_quality_score; + + let mut reader: Box = match detect_file_format(&file_pair[0])? { FileFormat::Fastq => { - process_fastq_file( - &args, - meros, - hash_config, - file_index, - file_pair, - &mut writers, - &mut sample_writer, - ); - } - FileFormat::Fasta => { - process_fasta_file( - &args, - meros, - hash_config, - file_index, - file_pair, - &mut writers, - &mut sample_writer, - ); + if let Some(file2) = file2 { + Box::new(FastqPairReader::from_path(file1, file2, file_index, score)?) + } else { + Box::new(FastqReader::from_path(file1, file_index, score)?) 
+ } } - } + FileFormat::Fasta => Box::new(FastaReader::from_path(file1, file_index)?), + }; + + process_fastx_file( + &args, + meros, + hash_config, + file_index, + &mut reader, + &mut writers, + &mut sample_writer, + ) + .expect("process fastx file error"); } Ok(()) }; diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index aadb67a..e13cbe2 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -1,3 +1,5 @@ +use seqkmer::{BaseType, HitGroup}; + use crate::compact_hash::{Compact, Row}; use crate::readcounts::TaxonCountersDash; use crate::taxonomy::Taxonomy; @@ -165,3 +167,88 @@ pub fn count_values( (counts, cur_taxon_counts, hit_count) } + +fn gen_hit_string(hit: &HitGroup, taxonomy: &Taxonomy, value_mask: usize) -> String { + let mut result = Vec::new(); + let mut last_pos = 0; + let count = hit.cap as u32; + + for row in &hit.rows { + let adjusted_pos = row.kmer_id - hit.offset; + + let value = row.value; + let key = value.right(value_mask); + let ext_code = taxonomy.nodes[key as usize].external_id; + + if last_pos == 0 && adjusted_pos > 0 { + result.push((0, adjusted_pos)); // 在开始处添加0 + } else if adjusted_pos - last_pos > 1 { + result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 + } + if let Some(last) = result.last_mut() { + if last.0 == ext_code { + last.1 += 1; + last_pos = adjusted_pos; + continue; + } + } + + // 添加当前key的计数 + result.push((ext_code, 1)); + last_pos = adjusted_pos; + } + + // 填充尾随0 + if last_pos < count - 1 { + if last_pos == 0 { + result.push((0, count - last_pos)); + } else { + result.push((0, count - last_pos - 1)); + } + } + + result + .iter() + .map(|i| format!("{}:{}", i.0, i.1)) + .collect::>() + .join(" ") +} + +pub fn adjust_hitlist_string( + hits: &BaseType>, + value_mask: usize, + taxonomy: &Taxonomy, +) -> String { + let hit_str = hits.apply(|hit| gen_hit_string(hit, taxonomy, value_mask)); + match hit_str { + BaseType::Single(hit) => hit, + BaseType::Pair((hit1, hit2)) => format!("{} |:| {}", hit1, 
hit2), + } +} + +pub fn count_rows( + hit: &BaseType>, + value_mask: usize, +) -> (HashMap, TaxonCountersDash, usize) { + let mut counts = HashMap::new(); + + let mut hit_count: usize = 0; + + let cur_taxon_counts = TaxonCountersDash::new(); + + hit.apply(|group| { + for row in &group.rows { + let value = row.value; + let key = value.right(value_mask); + *counts.entry(key).or_insert(0) += 1; + + cur_taxon_counts + .entry(key as u64) + .or_default() + .add_kmer(value as u64); + hit_count += 1; + } + }); + + (counts, cur_taxon_counts, hit_count) +} diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index 5570cb5..cc52309 100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -1,20 +1,20 @@ // 使用时需要引用模块路径 use crate::compact_hash::{Compact, HashConfig, Slot}; -use crate::mmscanner::MinimizerScanner; +// use crate::mmscanner::MinimizerScanner; use crate::taxonomy::{NCBITaxonomy, Taxonomy}; -use crate::Meros; +use seqkmer::Meros; use crate::utils::open_file; use byteorder::{LittleEndian, WriteBytesExt}; use rayon::prelude::*; -use seq_io::fasta::{Reader, Record}; -use seq_io::parallel::read_parallel; +// use seq_io::fasta::{Reader, Record}; +// use seq_io::parallel::read_parallel; +use seqkmer::{read_parallel as s_parallel, FastaReader}; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, BufWriter, Read, Result as IOResult, Write}; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering}; - // 定义每批次处理的 Cell 数量 const BATCH_SIZE: usize = 81920; @@ -190,6 +190,58 @@ pub fn get_bits_for_taxid( Ok(bits_needed_for_value.max(requested_bits_for_taxid)) } +// /// 将fna文件转换成k2格式的临时文件 +// pub fn convert_fna_to_k2_format>( +// fna_file: P, +// meros: Meros, +// taxonomy: &Taxonomy, +// id_to_taxon_map: &HashMap, +// hash_config: HashConfig, +// writers: &mut Vec>, +// chunk_size: usize, +// threads: u32, +// ) { +// let reader = Reader::from_path(fna_file).unwrap(); +// let queue_len = (threads - 2) as usize; +// let value_bits = 
hash_config.value_bits; +// let cell_size = std::mem::size_of::>(); + +// read_parallel( +// reader, +// threads, +// queue_len, +// |record_set| { +// let mut k2_cell_list = Vec::new(); + +// for record in record_set.into_iter() { +// if let Ok(seq_id) = record.id() { +// if let Some(ext_taxid) = id_to_taxon_map.get(seq_id) { +// let taxid = taxonomy.get_internal_id(*ext_taxid); +// for hash_key in MinimizerScanner::new(record.seq(), meros).into_iter() { +// let index: usize = hash_config.index(hash_key); +// let idx = index % chunk_size; +// let partition_index = index / chunk_size; +// let cell = Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); +// k2_cell_list.push((partition_index, cell)); +// } +// }; +// } +// } +// k2_cell_list +// }, +// |record_sets| { +// while let Some(Ok((_, k2_cell_map))) = record_sets.next() { +// for cell in k2_cell_map { +// let partition_index = cell.0; +// if let Some(writer) = writers.get_mut(partition_index) { +// writer.write_all(&cell.1.as_slice(cell_size)).unwrap(); +// } +// } +// } +// }, +// ); +// } + /// 将fna文件转换成k2格式的临时文件 pub fn convert_fna_to_k2_format>( fna_file: P, @@ -201,36 +253,40 @@ pub fn convert_fna_to_k2_format>( chunk_size: usize, threads: u32, ) { - let reader = Reader::from_path(fna_file).unwrap(); + let mut reader = FastaReader::from_path(fna_file, 1).unwrap(); let queue_len = (threads - 2) as usize; let value_bits = hash_config.value_bits; let cell_size = std::mem::size_of::>(); - read_parallel( - reader, - threads, + s_parallel( + &mut reader, + threads as usize, queue_len, - |record_set| { + meros, + |seqs| { let mut k2_cell_list = Vec::new(); - for record in record_set.into_iter() { - if let Ok(seq_id) = record.id() { - if let Some(ext_taxid) = id_to_taxon_map.get(seq_id) { - let taxid = taxonomy.get_internal_id(*ext_taxid); - for hash_key in MinimizerScanner::new(record.seq(), meros).into_iter() { - let index: usize = hash_config.index(hash_key); - let idx = index % chunk_size; - let 
partition_index = index / chunk_size; - let cell = Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); - k2_cell_list.push((partition_index, cell)); - } - }; + for record in seqs.iter() { + if let Some(ext_taxid) = id_to_taxon_map.get(&record.id) { + let taxid = taxonomy.get_internal_id(*ext_taxid); + record + .marker + .fold(&mut k2_cell_list, |k2_cell_list, marker| { + for &hash_key in marker.minimizer.iter() { + let index: usize = hash_config.index(hash_key); + let idx = index % chunk_size; + let partition_index = index / chunk_size; + let cell = + Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); + k2_cell_list.push((partition_index, cell)); + } + }); } } - k2_cell_list + Some(k2_cell_list) }, |record_sets| { - while let Some(Ok((_, k2_cell_map))) = record_sets.next() { + while let Some(Some(k2_cell_map)) = record_sets.next() { for cell in k2_cell_map { let partition_index = cell.0; if let Some(writer) = writers.get_mut(partition_index) { @@ -239,5 +295,6 @@ pub fn convert_fna_to_k2_format>( } } }, - ); + ) + .expect("failed"); } diff --git a/kr2r/src/feat.rs b/kr2r/src/feat.rs deleted file mode 100644 index 590bc81..0000000 --- a/kr2r/src/feat.rs +++ /dev/null @@ -1,121 +0,0 @@ -#[cfg(feature = "dna")] -pub mod constants { - pub const DEFAULT_KMER_LENGTH: u64 = 35; - pub const DEFAULT_MINIMIZER_LENGTH: u8 = 31; - pub const DEFAULT_MINIMIZER_SPACES: u8 = 7; - - pub const BITS_PER_CHAR: usize = 2; -} - -#[cfg(feature = "protein")] -pub mod constants { - pub const DEFAULT_KMER_LENGTH: u64 = 15; - pub const DEFAULT_MINIMIZER_LENGTH: u8 = 12; - pub const DEFAULT_MINIMIZER_SPACES: u8 = 0; - - pub const BITS_PER_CHAR: usize = 4; -} - -#[cfg(feature = "dna")] -#[inline] -pub fn char_to_value(c: u8) -> Option { - match c { - b'A' | b'a' => Some(0x00), - b'C' | b'c' => Some(0x01), - b'G' | b'g' => Some(0x02), - b'T' | b't' => Some(0x03), - _ => None, - } -} - -#[cfg(feature = "protein")] -#[inline] -pub fn char_to_value(c: u8) -> Option<64> { - 
match c { - // stop codons/rare amino acids - b'*' | b'U' | b'u' | b'O' | b'o' => Some(0x00), - // alanine - b'A' | b'a' => Some(0x01), - // asparagine, glutamine, serine - b'N' | b'n' | b'Q' | b'q' | b'S' | b's' => Some(0x02), - // cysteine - b'C' | b'c' => Some(0x03), - // aspartic acid, glutamic acid - b'D' | b'd' | b'E' | b'e' => Some(0x04), - // phenylalanine - b'F' | b'f' => Some(0x05), - // glycine - b'G' | b'g' => Some(0x06), - // histidine - b'H' | b'h' => Some(0x07), - // isoleucine, leucine - b'I' | b'i' | b'L' | b'l' => Some(0x08), - // lysine - b'K' | b'k' => Some(0x09), - // proline - b'P' | b'p' => Some(0x0a), - // arginine - b'R' | b'r' => Some(0x0b), - // methionine, valine - b'M' | b'm' | b'V' | b'v' => Some(0x0c), - // threonine - b'T' | b't' => Some(0x0d), - // tryptophan - b'W' | b'w' => Some(0x0e), - // tyrosine - b'Y' | b'y' => Some(0x0f), - _ => None, - } -} - -#[inline] -fn reverse_complement(mut kmer: u64, n: usize) -> u64 { - // Reverse bits while leaving bit pairs (nucleotides) intact. 
- - // Swap consecutive pairs of bits - kmer = (kmer >> 2 & 0x3333333333333333) | (kmer << 2 & 0xCCCCCCCCCCCCCCCC); - - // Swap consecutive nibbles (4-bit groups) - kmer = (kmer >> 4 & 0x0F0F0F0F0F0F0F0F) | (kmer << 4 & 0xF0F0F0F0F0F0F0F0); - - // Swap consecutive bytes - kmer = (kmer >> 8 & 0x00FF00FF00FF00FF) | (kmer << 8 & 0xFF00FF00FF00FF00); - - // Swap consecutive pairs of bytes - kmer = (kmer >> 16 & 0x0000FFFF0000FFFF) | (kmer << 16 & 0xFFFF0000FFFF0000); - - // Swap the two halves of the 64-bit word - kmer = (kmer >> 32) | (kmer << 32); - - // Complement the bits, shift to the right length, and mask to get the desired length - (!kmer >> (64 - n * 2)) & ((1u64 << (n * 2)) - 1) - - // if revcom_version == 0 { - // // Complement the bits and mask to get the desired length - // !kmer & ((1u64 << (n * 2)) - 1) - // } else { - // // Complement the bits, shift to the right length, and mask to get the desired length - // (!kmer >> (64 - n * 2)) & ((1u64 << (n * 2)) - 1) - // } -} - -#[cfg(feature = "dna")] -#[inline] -pub fn canonical_representation(kmer: u64, n: usize) -> u64 { - let revcom = reverse_complement(kmer, n); - if kmer < revcom { - kmer - } else { - revcom - } -} - -#[cfg(feature = "protein")] -#[inline] -pub fn canonical_representation(kmer: u64, n: usize, revcom_version: u8) -> u64 { - kmer -} - -pub const DEFAULT_TOGGLE_MASK: u64 = 0xe37e28c4271b5a2d; -pub const DEFAULT_SPACED_SEED_MASK: u64 = 0; -pub const CURRENT_REVCOM_VERSION: u8 = 1; diff --git a/kr2r/src/kr2r_data.rs b/kr2r/src/kr2r_data.rs index 792f2c1..6e522e6 100644 --- a/kr2r/src/kr2r_data.rs +++ b/kr2r/src/kr2r_data.rs @@ -1,16 +1,12 @@ use crate::utils::open_file; // use crate::{Meros, CURRENT_REVCOM_VERSION}; -use crate::{ - BITS_PER_CHAR, CURRENT_REVCOM_VERSION, DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, - DEFAULT_SPACED_SEED_MASK, DEFAULT_TOGGLE_MASK, -}; +use seqkmer::Meros; +use seqkmer::CURRENT_REVCOM_VERSION; use std::fs::File; use std::io::{Read, Result as IoResult, Write}; 
use std::mem; use std::path::Path; -use seqkmer::Meros as SMeros; - pub fn parse_binary(src: &str) -> Result { u64::from_str_radix(src, 2) } @@ -29,63 +25,6 @@ pub fn construct_seed_template(minimizer_len: usize, minimizer_spaces: usize) -> format!("{}{}", core, spaces) } -/// minimizer config -#[derive(Copy, Debug, Clone)] -pub struct Meros { - pub k_mer: usize, - pub l_mer: usize, - pub mask: u64, - pub spaced_seed_mask: u64, - pub toggle_mask: u64, - pub min_clear_hash_value: Option, -} - -impl Meros { - pub fn new( - k_mer: usize, - l_mer: usize, - spaced_seed_mask: Option, - toggle_mask: Option, - min_clear_hash_value: Option, - ) -> Self { - let mut mask = 1u64; - mask <<= l_mer * BITS_PER_CHAR; - mask -= 1; - - Self { - k_mer, - l_mer, - mask, - spaced_seed_mask: spaced_seed_mask.unwrap_or(DEFAULT_SPACED_SEED_MASK), - toggle_mask: toggle_mask.unwrap_or(DEFAULT_TOGGLE_MASK) & mask, - min_clear_hash_value, - } - } - - pub fn window_size(&self) -> usize { - self.k_mer - self.l_mer - } -} - -impl Default for Meros { - fn default() -> Self { - let l_mer = DEFAULT_MINIMIZER_LENGTH as usize; - let k_mer = DEFAULT_KMER_LENGTH as usize; - let mut mask = 1u64; - mask <<= l_mer * BITS_PER_CHAR; - mask -= 1; - - Self { - k_mer, - l_mer, - mask, - spaced_seed_mask: DEFAULT_SPACED_SEED_MASK, - toggle_mask: DEFAULT_TOGGLE_MASK & mask, - min_clear_hash_value: Some(0), - } - } -} - /// 判断u64的值是否为0,并将其转换为Option类型 pub fn u64_to_option(value: u64) -> Option { Option::from(value).filter(|&x| x != 0) @@ -182,14 +121,4 @@ impl IndexOptions { u64_to_option(self.minimum_acceptable_hash_value), ) } - - pub fn as_smeros(&self) -> SMeros { - SMeros::new( - self.k, - self.l, - u64_to_option(self.spaced_seed_mask), - u64_to_option(self.toggle_mask), - u64_to_option(self.minimum_acceptable_hash_value), - ) - } } diff --git a/kr2r/src/lib.rs b/kr2r/src/lib.rs index 6e32bfb..8f5dd6e 100644 --- a/kr2r/src/lib.rs +++ b/kr2r/src/lib.rs @@ -1,17 +1,10 @@ -pub mod kr2r_data; +mod kr2r_data; mod 
kv_store; -pub mod mmscanner; pub mod readcounts; pub mod report; -pub mod seq; pub mod taxonomy; pub mod utils; -mod feat; - -pub use feat::constants::*; -pub use feat::*; - pub mod db; pub use kr2r_data::*; pub use kv_store::*; diff --git a/kr2r/src/mmscanner.rs b/kr2r/src/mmscanner.rs deleted file mode 100644 index 539788a..0000000 --- a/kr2r/src/mmscanner.rs +++ /dev/null @@ -1,270 +0,0 @@ -// kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash -use crate::{ - canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, -}; -use std::collections::VecDeque; - -#[inline] -fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { - let mut canonical_lmer = canonical_representation(lmer, meros.l_mer); - if meros.spaced_seed_mask > 0 { - canonical_lmer &= meros.spaced_seed_mask; - } - canonical_lmer ^ meros.toggle_mask -} - -#[derive(Debug)] -struct MinimizerData { - pub pos: usize, - candidate_lmer: u64, -} - -impl MinimizerData { - fn new(candidate_lmer: u64, pos: usize) -> Self { - Self { - candidate_lmer, - pos, - } - } -} - -pub struct MinimizerWindow { - queue: VecDeque, - queue_pos: usize, - /// 窗口队列的大小 - capacity: usize, - /// 队列计数 - count: usize, -} - -impl MinimizerWindow { - fn new(capacity: usize) -> Self { - Self { - queue: VecDeque::with_capacity(capacity), - capacity, - count: 0, - queue_pos: 0, - } - } - - #[inline] - fn next(&mut self, candidate_lmer: u64) -> Option { - // 无需比较,直接返回 - if self.capacity == 1 { - return Some(candidate_lmer); - } - - let data = MinimizerData::new(candidate_lmer, self.count); - - // 移除队列中所有比当前元素大的元素的索引 - // 因为它们不可能是当前窗口的最小值 - while let Some(m_data) = self.queue.back() { - if m_data.candidate_lmer > candidate_lmer { - self.queue.pop_back(); - } else { - break; - } - } - let mut changed = false; - - if (self.queue.is_empty() && self.count >= self.capacity) || self.count == self.capacity { - changed = true - } - // 将当前元素的索引添加到队列 - self.queue.push_back(data); - - while !self.queue.is_empty() - && 
self.queue.front().map_or(false, |front| { - self.count >= self.capacity && front.pos < self.count - self.capacity - }) - { - self.queue.pop_front(); - changed = true; - } - - self.count += 1; - if changed { - self.queue.front().map(|front| front.candidate_lmer) - } else { - None - } - } - - fn clear(&mut self) { - self.count = 0; - self.queue_pos = 0; - self.queue.clear(); - } -} - -struct Cursor { - pos: usize, - end: usize, - inner: Vec, - capacity: usize, - value: u64, - mask: u64, - window: MinimizerWindow, -} - -impl Cursor { - fn new(meros: &Meros, size: usize) -> Self { - Self { - pos: 0, - end: size, - inner: Vec::with_capacity(meros.l_mer), - capacity: meros.l_mer, - value: 0, - mask: meros.mask, - window: MinimizerWindow::new(meros.window_size()), - } - } - - /// 每次取一个 lmer 值出来,如果为空,表示一直 seq 已处理完成 - /// 遇到换行符,就跳过. - #[inline] - fn slide(&mut self, seq: &[u8]) -> Option { - while self.pos < self.end { - let ch = seq[self.pos]; - let code = if ch == b'\n' || ch == b'\r' { - self.pos += 1; - char_to_value(seq[self.pos]) - } else { - char_to_value(ch) - }; - self.pos += 1; - if let Some(c) = code { - if let Some(lmer) = self.next_lmer(c) { - return Some(lmer); - } - } else { - self.clear(); - } - } - None - } - - fn next_lmer(&mut self, item: u64) -> Option { - self.value <<= BITS_PER_CHAR; - self.value |= item; - if self.inner.len() == self.capacity { - self.inner.remove(0); // 移除最旧的元素 - } - self.inner.push(item); // 使用 push 方法 - if self.inner.len() >= self.capacity { - self.value &= self.mask; - return Some(self.value); - } - - None - } - - #[inline] - fn next_candidate_lmer(&mut self, item: u64) -> Option { - self.window.next(item) - } - - pub fn has_next(&self) -> bool { - self.pos < self.end - } - - // 清除元素 - #[inline] - fn clear(&mut self) { - self.inner.clear(); - self.value = 0; - self.window.clear(); - } -} - -pub struct MinimizerScanner<'a> { - seq: &'a [u8], - meros: Meros, - // l_mer: usize, - cursor: Cursor, - // 存最近一个最小值 - // last_minimizer: 
u64, -} - -impl<'a> MinimizerScanner<'a> { - pub fn new(seq: &'a [u8], meros: Meros) -> Self { - let size: usize = seq.len(); - MinimizerScanner { - seq, - meros, - cursor: Cursor::new(&meros, size), - // last_minimizer: std::u64::MAX, - } - } - - /// 在一个序列上滑动一个光标(可能是为了找到下一个有意义的片段或窗口), - /// 并对滑动得到的片段进行某种转换或处理。如果光标无法继续滑动(例如到达序列的末尾),则返回 None。 - fn next_window(&mut self) -> Option { - self.cursor.slide(&self.seq).and_then(|lmer| { - let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); - self.cursor.next_candidate_lmer(candidate_lmer) - }) - } -} - -impl<'a> Default for MinimizerScanner<'a> { - fn default() -> Self { - let meros = Meros::default(); - let seq: &[u8] = &[]; - MinimizerScanner::new(seq, meros) - } -} - -impl<'a> Iterator for MinimizerScanner<'a> { - type Item = u64; - - fn next(&mut self) -> Option { - while self.cursor.has_next() { - if let Some(minimizer) = self.next_window() { - return Some(murmur_hash3(minimizer ^ self.meros.toggle_mask)); - // if minimizer != self.last_minimizer { - // self.last_minimizer = minimizer; - // return Some(murmur_hash3(minimizer ^ self.meros.toggle_mask)); - // } - } - } - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_minimizer() { - // 1, 2, 3, 4 - let seq: Vec = vec![1, 2, 3, 4]; - // 窗口大小 = 2 - 0 + 1 - let mut mini: MinimizerWindow = MinimizerWindow::new(1); - let mut result = vec![]; - for s in seq { - if let Some(a) = mini.next(s) { - result.push(a); - } - } - // if let Some(a) = mini.get_last_minimizer() { - // result.push(a); - // } - assert_eq!(result, [1, 2, 3, 4]); - - let seq: Vec = vec![4, 3, 5, 2, 6, 2, 1]; - // 窗口大小 = 2 - 0 + 1 - let mut mini = MinimizerWindow::new(2); - let mut result = vec![]; - for s in seq { - if let Some(a) = mini.next(s) { - result.push(a); - } - } - // if let Some(a) = mini.get_last_minimizer() { - // result.push(a); - // } - assert_eq!(result, [3, 2, 2, 2, 1]); - } -} diff --git a/kr2r/src/seq.rs b/kr2r/src/seq.rs deleted file mode 100644 
index db24286..0000000 --- a/kr2r/src/seq.rs +++ /dev/null @@ -1,270 +0,0 @@ -use crate::mmscanner::MinimizerScanner; -use crate::utils::open_file; -use seq_io::fasta; -use seq_io::fasta::Record as FaRecord; -use seq_io::fastq; -use seq_io::fastq::Record as FqRecord; - -use seq_io::parallel::Reader; - -use crate::utils::is_gzipped; -use crate::Meros; -use seq_io::policy::StdPolicy; -use std::collections::HashSet; -use std::io; -use std::iter; -use std::path::Path; - -type DefaultBufPolicy = StdPolicy; -use flate2::read::GzDecoder; - -pub trait SeqX { - fn seq_x(&self, score: i32) -> Vec; -} - -impl<'a> SeqX for fastq::RefRecord<'a> { - fn seq_x(&self, score: i32) -> Vec { - if score <= 0 { - return self.seq().to_vec(); - } - - let qual = self.qual(); - self.seq() - .iter() - .zip(qual.iter()) - .map(|(&base, &qscore)| { - if (qscore as i32 - '!' as i32) < score { - b'x' - } else { - base - } - }) - .collect::>() - } -} - -impl SeqX for fastq::OwnedRecord { - fn seq_x(&self, score: i32) -> Vec { - if score <= 0 { - return self.seq().to_vec(); - } - let qual = self.qual(); - self.seq() - .iter() - .zip(qual.iter()) - .map(|(&base, &qscore)| { - if (qscore as i32 - '!' as i32) < score { - b'x' - } else { - base - } - }) - .collect::>() - } -} - -impl<'a> SeqX for fasta::RefRecord<'a> { - #[allow(unused_variables)] - fn seq_x(&self, score: i32) -> Vec { - self.seq().to_vec() - } -} - -#[derive(Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct SeqReads { - pub dna_id: String, - pub seq_paired: Vec>, -} - -pub trait SeqSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet; -} - -pub fn open_fasta_reader>( - path1: P, -) -> io::Result>> { - let mut file1 = open_file(&path1)?; - - let read1: Box = if is_gzipped(&mut file1)? { - Box::new(GzDecoder::new(file1)) - } else { - Box::new(file1) - }; - - let reader1 = fasta::Reader::new(read1); - Ok(reader1) -} - -pub struct PairFastqReader

{ - reader1: fastq::Reader, P>, - reader2: Option, P>>, -} - -impl<'a> PairFastqReader { - /// Creates a reader from a file path. - #[inline] - pub fn from_path>(path1: P, path2: Option

) -> io::Result { - // 分别打开两个文件 - let mut file1 = open_file(&path1)?; - - let read1: Box = if is_gzipped(&mut file1)? { - Box::new(GzDecoder::new(file1)) - } else { - Box::new(file1) - }; - - let reader1 = fastq::Reader::new(read1); - - let reader2 = match path2 { - Some(path2) => { - let mut file2 = open_file(path2)?; - let read2: Box = if is_gzipped(&mut file2)? { - Box::new(GzDecoder::new(file2)) - } else { - Box::new(file2) - }; - Some(fastq::Reader::new(read2)) - } - None => None, - }; - - // 使用这两个实例构造一个 PairFastqReader 对象 - Ok(PairFastqReader { reader1, reader2 }) - } - - pub fn next(&mut self) -> Option { - let ref_record1 = self - .reader1 - .next()? - .expect("fastq file error") - .to_owned_record(); - let ref_record2 = match &mut self.reader2 { - Some(reader2) => Some(reader2.next()?.expect("fastq file error").to_owned_record()), - None => None, - }; - // let ref_recrod2 = self - // .reader2 - // .next()? - // .expect("fastq file error") - // .to_owned_record(); - - Some(PairFastqRecord(ref_record1, ref_record2)) - } -} - -pub struct PairFastqRecord(pub fastq::OwnedRecord, pub Option); - -pub struct PairFastqRecordSet(fastq::RecordSet, fastq::RecordSet); - -impl<'a> iter::IntoIterator for &'a PairFastqRecordSet { - type Item = (fastq::RefRecord<'a>, Option>); - type IntoIter = PairFastqRecordSetIter<'a>; - - #[inline] - fn into_iter(self) -> Self::IntoIter { - PairFastqRecordSetIter(self.0.into_iter(), self.1.into_iter()) - } -} - -pub struct PairFastqRecordSetIter<'a>(fastq::RecordSetIter<'a>, fastq::RecordSetIter<'a>); - -impl Default for PairFastqRecordSet { - fn default() -> Self { - PairFastqRecordSet(fastq::RecordSet::default(), fastq::RecordSet::default()) - } -} - -impl<'a> Iterator for PairFastqRecordSetIter<'a> { - type Item = (fastq::RefRecord<'a>, Option>); - - #[inline] - fn next(&mut self) -> Option { - match (self.0.next(), self.1.next()) { - (Some(record1), Some(record2)) => Some((record1, Some(record2))), - (Some(record1), None) => 
Some((record1, None)), - _ => None, // Return None if either iterator runs out of records - } - } -} - -impl

Reader for PairFastqReader

-where - P: seq_io::policy::BufPolicy + Send, -{ - type DataSet = PairFastqRecordSet; - type Err = fastq::Error; - - #[inline] - fn fill_data(&mut self, rset: &mut PairFastqRecordSet) -> Option> { - match self.reader1.read_record_set(&mut rset.0)? { - Ok(_) => { - if let Some(ref mut reader) = &mut self.reader2 { - match reader.read_record_set(&mut rset.1)? { - Ok(_) => Some(Ok(())), - Err(_) => None, - } - } else { - Some(Ok(())) - } - } - Err(e) => { - println!("{:?}", e); - None - } - } - - // If both reads are successful, return Ok(()) - // Some(Ok(())) - } -} - -impl SeqSet for PairFastqRecordSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet { - let mut seq_pair_set = HashSet::::new(); - - for records in self.into_iter() { - let dna_id = records.0.id().unwrap_or_default().to_string(); - let seq1 = records.0.seq_x(score); - if let Some(record3) = records.1 { - let seq2 = record3.seq_x(score); - let kmers1 = MinimizerScanner::new(&seq1, meros).collect(); - let kmers2 = MinimizerScanner::new(&seq2, meros).collect(); - let seq_paired: Vec> = vec![kmers1, kmers2]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } else { - let kmers1 = MinimizerScanner::new(&seq1, meros).collect(); - let seq_paired: Vec> = vec![kmers1]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } - } - seq_pair_set - } -} - -impl SeqSet for fastq::RecordSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet { - let mut seq_pair_set = HashSet::::new(); - for records in self.into_iter() { - let dna_id = records.id().unwrap_or_default().to_string(); - let seq1 = records.seq_x(score); - let kmers1: Vec = MinimizerScanner::new(&seq1, meros).collect(); - let seq_paired: Vec> = vec![kmers1]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } - seq_pair_set - } -} - -impl SeqSet for fasta::RecordSet { - fn to_seq_reads(&self, score: i32, meros: Meros) -> HashSet { - let mut seq_pair_set = HashSet::::new(); - for records in self.into_iter() { 
- let dna_id = records.id().unwrap_or_default().to_string(); - let seq1 = records.seq_x(score); - let kmers1: Vec = MinimizerScanner::new(&seq1, meros).collect(); - let seq_paired: Vec> = vec![kmers1]; - seq_pair_set.insert(SeqReads { dna_id, seq_paired }); - } - seq_pair_set - } -} diff --git a/seqkmer/src/feat.rs b/seqkmer/src/feat.rs index 8b06118..87b753f 100644 --- a/seqkmer/src/feat.rs +++ b/seqkmer/src/feat.rs @@ -194,7 +194,7 @@ impl Default for Meros { k_mer, l_mer, mask, - spaced_seed_mask: 4611686018212639539, // DEFAULT_SPACED_SEED_MASK + spaced_seed_mask: DEFAULT_SPACED_SEED_MASK, toggle_mask: DEFAULT_TOGGLE_MASK & mask, min_clear_hash_value: None, } diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs index 6a6d58d..7eab6e2 100644 --- a/seqkmer/src/lib.rs +++ b/seqkmer/src/lib.rs @@ -1,11 +1,15 @@ mod fasta; mod fastq; mod feat; -pub mod mmscanner; -pub mod reader; -pub mod seq; -pub use feat::constants::*; -pub use feat::*; -pub mod parallel; +mod mmscanner; +mod parallel; +mod reader; +mod seq; + pub use fasta::*; pub use fastq::*; +pub use feat::constants::*; +pub use feat::*; +pub use parallel::*; +pub use reader::*; +pub use seq::*; diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index 644c6ee..a9497d8 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -1,7 +1,7 @@ use crate::reader::{Reader, SeqMer}; use crate::seq::Sequence; use crate::Meros; -use crossbeam_channel::{bounded, Receiver, RecvError}; +use crossbeam_channel::{bounded, Receiver}; use scoped_threadpool::Pool; use std::io::Result; use std::sync::Arc; @@ -18,13 +18,13 @@ where P: Send, { #[inline] - pub fn next(&mut self) -> std::result::Result { - self.recv.recv() + pub fn next(&mut self) -> Option

{ + self.recv.recv().ok() } } -pub fn read_parallel( - reader: &mut Box, +pub fn read_parallel( + reader: &mut R, n_threads: usize, buffer_len: usize, meros: Meros, @@ -32,6 +32,7 @@ pub fn read_parallel( func: F, ) -> Result<()> where + R: Reader, O: Send, Out: Send + Default, W: Send + Sync + Fn(Vec) -> Option, diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index 04c27e1..1301f16 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -44,6 +44,12 @@ pub trait Reader: Send { fn next(&mut self) -> Result>>; } +impl Reader for Box { + fn next(&mut self) -> Result>> { + (**self).next() + } +} + #[derive(Debug, Clone)] pub struct SeqMer { pub id: String, @@ -64,14 +70,66 @@ impl SeqMer { } } - pub fn size_str(&self) -> BaseType { + pub fn cap_str(&self) -> BaseType { self.marker.apply(|marker| marker.cap.to_string()) } - pub fn fmt_size(&self) -> String { + pub fn total_size(&self) -> usize { + match &self.marker { + BaseType::Single(marker1) => marker1.size(), + BaseType::Pair((marker1, marker2)) => marker1.size() + marker2.size(), + } + } + + pub fn fmt_cap(&self) -> String { match &self.marker { BaseType::Single(marker1) => marker1.cap.to_string(), - BaseType::Pair((marker1, marker2)) => format!("{}:{}", marker1.cap, marker2.cap), + BaseType::Pair((marker1, marker2)) => format!("{}|{}", marker1.cap, marker2.cap), + } + } + + pub fn fmt_size(&self) -> String { + match &self.marker { + BaseType::Single(marker1) => marker1.size().to_string(), + BaseType::Pair((marker1, marker2)) => format!("{}|{}", marker1.size(), marker2.size()), + } + } +} + +#[derive(Debug)] +pub struct HitGroup { + /// minimizer capacity + pub cap: usize, + /// hit value vector + pub rows: Vec, + /// pair offset + pub offset: u32, +} + +impl HitGroup { + pub fn new(cap: usize, rows: Vec, offset: u32) -> Self { + Self { cap, rows, offset } + } +} + +impl BaseType> { + /// Synchronizes the offset of the second element of a `Pair` to the `cap` of the first element. 
+ /// This alignment is only necessary when the `rows` property of the `HitGroup` is in an + /// increasing order. If `rows` is not increasing, aligning the offset based on `cap` may not + /// be appropriate or required. + /// + /// # Example + /// + /// ``` + /// let mut hit_group1 = HitGroup::new(10, vec![1, 2, 3], 0); // Increasing `rows` + /// let mut hit_group2 = HitGroup::new(20, vec![4, 5, 6], 0); + /// + /// let mut pair = BaseType::Pair((hit_group1, hit_group2)); + /// pair.align_offset(); + /// ``` + pub fn align_offset(&mut self) { + if let BaseType::Pair((ref first, ref mut second)) = self { + second.offset = first.cap as u32; } } } diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs index c0e2840..04ccfba 100644 --- a/seqkmer/src/seq.rs +++ b/seqkmer/src/seq.rs @@ -6,9 +6,9 @@ pub enum SeqFormat { } #[derive(Debug, Clone, PartialEq, Eq)] -pub enum BaseType { - Single(U), - Pair((U, U)), +pub enum BaseType { + Single(T), + Pair((T, T)), } impl BaseType { @@ -22,6 +22,33 @@ impl BaseType { BaseType::Pair((seq1, seq2)) => BaseType::Pair((func(seq1), func(seq2))), } } + + pub fn fold(&self, init: &mut V, mut func: F) -> BaseType + where + F: FnMut(&mut V, &T) -> U, + { + match self { + BaseType::Single(seq) => BaseType::Single(func(init, seq)), + BaseType::Pair((seq1, seq2)) => { + let res1 = func(init, seq1); + let res2 = func(init, seq2); + BaseType::Pair((res1, res2)) + } + } + } + + pub fn modify(&mut self, mut func: F) + where + F: FnMut(&mut T), + { + match self { + BaseType::Single(ref mut seq) => func(seq), + BaseType::Pair((ref mut seq1, ref mut seq2)) => { + func(seq1); + func(seq2); + } + } + } } impl BaseType> { From 18c0742b41091998ece85f5886707f5774ce69bc Mon Sep 17 00:00:00 2001 From: dagou Date: Sun, 16 Jun 2024 22:37:25 +0800 Subject: [PATCH 04/18] seq kmer --- kr2r/src/bin/classify.rs | 73 ++++++++++------------------------------ kr2r/src/bin/splitr.rs | 22 +++--------- kr2r/src/classify.rs | 49 +++++++++++++++++++++++---- 
kr2r/src/utils.rs | 73 +--------------------------------------- seqkmer/src/parallel.rs | 30 +++++++++++++++-- seqkmer/src/reader.rs | 42 +++++++++++++++++++++-- 6 files changed, 133 insertions(+), 156 deletions(-) diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index c6d6376..ebd5740 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -1,12 +1,10 @@ use clap::Parser; -use kr2r::classify::{adjust_hitlist_string, count_rows, resolve_tree, trim_pair_info}; +use kr2r::classify::{process_hitgroup, trim_pair_info}; use kr2r::compact_hash::{CHTable, Compact, HashConfig, Row}; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; -use kr2r::utils::{ - create_sample_file, detect_file_format, find_and_sort_files, get_lastest_file_index, FileFormat, -}; +use kr2r::utils::{create_sample_file, find_and_sort_files, get_lastest_file_index}; use kr2r::IndexOptions; use std::collections::HashMap; use std::fs::File; @@ -16,10 +14,7 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; -use seqkmer::{ - read_parallel, BaseType, FastaReader, FastqPairReader, FastqReader, HitGroup, Marker, Meros, - Reader, SeqMer, -}; +use seqkmer::{create_reader, read_parallel, BaseType, HitGroup, Marker, Meros, Reader, SeqMer}; #[derive(Parser, Debug, Clone)] #[clap( @@ -121,47 +116,27 @@ fn process_record( cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, ) -> String { - let value_mask = hash_config.value_mask; - let seq_len_str = seq.fmt_cap(); let hits: BaseType> = seq .marker .apply(|marker| process_seq(&marker, &hash_config, chtable)); let total_kmers = seq.total_size(); - let (counts, cur_counts, hit_groups) = count_rows(&hits, value_mask); - let hit_string = adjust_hitlist_string(&hits, value_mask, taxonomy); - let mut call = resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold); - if call > 0 && 
hit_groups < args.minimum_hit_groups { - call = 0; - }; - - cur_counts.iter().for_each(|entry| { - cur_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(entry.value()) - .unwrap(); - }); - let ext_call = taxonomy.nodes[call as usize].external_id; - let clasify = if call > 0 { - classify_counter.fetch_add(1, Ordering::SeqCst); - cur_taxon_counts - .entry(call as u64) - .or_default() - .increment_read_count(); - - "C" - } else { - "U" - }; - // 使用锁来同步写入 - let output_line = format!( - "{}\t{}\t{}\t{}\t{}\n", - clasify, dna_id, ext_call, seq_len_str, hit_string + let hit_data = process_hitgroup( + &hits, + hash_config, + taxonomy, + cur_taxon_counts, + classify_counter, + total_kmers, + args.confidence_threshold, + args.minimum_hit_groups, ); - output_line + format!( + "{}\t{}\t{}\t{}\t{}\n", + hit_data.0, dna_id, hit_data.1, seq_len_str, hit_data.2 + ) } fn process_fastx_file( @@ -293,22 +268,8 @@ fn process_files( writeln!(file_writer, "{}\t{}", file_index, file_pair.join(","))?; file_writer.flush().unwrap(); - let mut files_iter = file_pair.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); let score = args.minimum_quality_score; - - let mut reader: Box = match detect_file_format(&file_pair[0])? { - FileFormat::Fastq => { - if let Some(file2) = file2 { - Box::new(FastqPairReader::from_path(file1, file2, file_index, score)?) - } else { - Box::new(FastqReader::from_path(file1, file_index, score)?) 
- } - } - FileFormat::Fasta => Box::new(FastaReader::from_path(file1, file_index)?), - }; - + let mut reader: Box = create_reader(file_pair, file_index, score)?; let (thread_sequences, thread_unclassified) = process_fastx_file( &args, meros, diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index fb0046d..db11ab1 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -1,11 +1,11 @@ use clap::Parser; use kr2r::compact_hash::{HashConfig, Slot}; use kr2r::utils::{ - create_partition_files, create_partition_writers, create_sample_file, detect_file_format, - get_file_limit, get_lastest_file_index, FileFormat, + create_partition_files, create_partition_writers, create_sample_file, get_file_limit, + get_lastest_file_index, }; use kr2r::IndexOptions; -use seqkmer::{read_parallel, FastaReader, FastqPairReader, FastqReader, Marker, Meros, Reader}; +use seqkmer::{create_reader, read_parallel, Marker, Meros, Reader}; use std::fs; use std::io::{BufWriter, Write}; use std::io::{Error, ErrorKind, Result}; @@ -212,22 +212,8 @@ fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { let mut sample_writer = create_sample_file(args.chunk_dir.join(format!("sample_id_{}.map", file_index))); - let mut files_iter = file_pair.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); let score = args.minimum_quality_score; - - let mut reader: Box = match detect_file_format(&file_pair[0])? { - FileFormat::Fastq => { - if let Some(file2) = file2 { - Box::new(FastqPairReader::from_path(file1, file2, file_index, score)?) - } else { - Box::new(FastqReader::from_path(file1, file_index, score)?) 
- } - } - FileFormat::Fasta => Box::new(FastaReader::from_path(file1, file_index)?), - }; - + let mut reader: Box = create_reader(file_pair, file_index, score)?; process_fastx_file( &args, meros, diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index e13cbe2..e52a2bd 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -1,9 +1,10 @@ use seqkmer::{BaseType, HitGroup}; -use crate::compact_hash::{Compact, Row}; +use crate::compact_hash::{Compact, HashConfig, Row}; use crate::readcounts::TaxonCountersDash; use crate::taxonomy::Taxonomy; use std::collections::HashMap; +use std::sync::atomic::{AtomicUsize, Ordering}; fn generate_hit_string( count: u32, @@ -214,7 +215,7 @@ fn gen_hit_string(hit: &HitGroup, taxonomy: &Taxonomy, value_mask: usize) - .join(" ") } -pub fn adjust_hitlist_string( +fn adjust_hitlist_string( hits: &BaseType>, value_mask: usize, taxonomy: &Taxonomy, @@ -226,15 +227,16 @@ pub fn adjust_hitlist_string( } } -pub fn count_rows( +fn count_rows( hit: &BaseType>, + cur_taxon_counts: &TaxonCountersDash, value_mask: usize, -) -> (HashMap, TaxonCountersDash, usize) { +) -> (HashMap, usize) { let mut counts = HashMap::new(); let mut hit_count: usize = 0; - let cur_taxon_counts = TaxonCountersDash::new(); + // let cur_taxon_counts = TaxonCountersDash::new(); hit.apply(|group| { for row in &group.rows { @@ -250,5 +252,40 @@ pub fn count_rows( } }); - (counts, cur_taxon_counts, hit_count) + (counts, hit_count) +} + +pub fn process_hitgroup( + hits: &BaseType>, + hash_config: &HashConfig, + taxonomy: &Taxonomy, + cur_taxon_counts: &TaxonCountersDash, + classify_counter: &AtomicUsize, + total_kmers: usize, + confidence_threshold: f64, + minimum_hit_groups: usize, +) -> (String, u64, String) { + let value_mask = hash_config.value_mask; + let (counts, hit_groups) = count_rows(&hits, cur_taxon_counts, value_mask); + let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); + if call > 0 && hit_groups < minimum_hit_groups { 
+ call = 0; + }; + + let ext_call = taxonomy.nodes[call as usize].external_id; + let clasify = if call > 0 { + classify_counter.fetch_add(1, Ordering::SeqCst); + cur_taxon_counts + .entry(call as u64) + .or_default() + .increment_read_count(); + + "C" + } else { + "U" + }; + + let hit_string = adjust_hitlist_string(&hits, value_mask, taxonomy); + + (clasify.to_owned(), ext_call, hit_string) } diff --git a/kr2r/src/utils.rs b/kr2r/src/utils.rs index f076344..a32a686 100644 --- a/kr2r/src/utils.rs +++ b/kr2r/src/utils.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::fs::{self, create_dir_all, File, OpenOptions}; -use std::io::{BufRead, BufReader, BufWriter, Result, Seek, Write}; +use std::io::{self, BufRead, BufReader, BufWriter, Result, Write}; use std::path::{Path, PathBuf}; use walkdir::WalkDir; @@ -142,77 +142,6 @@ pub fn format_bytes(size: f64) -> String { format!("{:.2}{}", size, current_suffix) } -#[derive(Debug)] -pub enum FileFormat { - Fasta, - Fastq, -} - -use flate2::read::GzDecoder; -use std::io::{self, Read}; - -pub fn is_gzipped(file: &mut File) -> io::Result { - let mut buffer = [0; 2]; - file.read_exact(&mut buffer)?; - file.rewind()?; // 重置文件指针到开头 - Ok(buffer == [0x1F, 0x8B]) -} - -pub fn detect_file_format>(path: P) -> io::Result { - let mut file = open_file(path)?; - let read1: Box = if is_gzipped(&mut file)? 
{ - Box::new(GzDecoder::new(file)) - } else { - Box::new(file) - }; - - let reader = BufReader::new(read1); - let mut lines = reader.lines(); - - if let Some(first_line) = lines.next() { - let line = first_line?; - - if line.starts_with('>') { - return Ok(FileFormat::Fasta); - } else if line.starts_with('@') { - let _ = lines.next(); - if let Some(third_line) = lines.next() { - let line: String = third_line?; - if line.starts_with('+') { - return Ok(FileFormat::Fastq); - } - } - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - "Unrecognized fasta(fastq) file format", - )); - } - } - - Err(io::Error::new( - io::ErrorKind::Other, - "Unrecognized fasta(fastq) file format", - )) - // let mut buffer = [0; 1]; // 仅分配一个字节的缓冲区 - - // // 读取文件的第一个字节 - // let bytes_read = reader.read(&mut buffer)?; - - // if bytes_read == 0 { - // return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "Empty file")); - // } - - // match buffer[0] { - // b'>' => Ok(FileFormat::Fasta), - // b'@' => Ok(FileFormat::Fastq), - // _ => Err(io::Error::new( - // io::ErrorKind::Other, - // "Unrecognized file format", - // )), - // } -} - #[cfg(unix)] extern crate libc; diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index a9497d8..3ccce75 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -1,6 +1,8 @@ -use crate::reader::{Reader, SeqMer}; +use crate::fasta::FastaReader; +use crate::fastq::{FastqPairReader, FastqReader}; +use crate::reader::{detect_file_format, Reader, SeqMer}; use crate::seq::Sequence; -use crate::Meros; +use crate::{Meros, SeqFormat}; use crossbeam_channel::{bounded, Receiver}; use scoped_threadpool::Pool; use std::io::Result; @@ -23,6 +25,30 @@ where } } +pub fn create_reader( + file_pair: &[String], + file_index: usize, + score: i32, +) -> Result> { + let mut files_iter = file_pair.iter(); + let file1 = files_iter.next().cloned().unwrap(); + let file2 = files_iter.next().cloned(); + match detect_file_format(&file_pair[0])? 
{ + SeqFormat::Fastq => { + if let Some(file2) = file2 { + Ok(Box::new(FastqPairReader::from_path( + file1, file2, file_index, score, + )?)) + } else { + Ok(Box::new(FastqReader::from_path(file1, file_index, score)?)) + } + } + SeqFormat::Fasta => Ok(Box::new(FastaReader::from_path(file1, file_index)?)), + + _ => unreachable!(), + } +} + pub fn read_parallel( reader: &mut R, n_threads: usize, diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index 1301f16..a74954c 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -1,8 +1,8 @@ -use crate::seq::{BaseType, Marker, Sequence}; +use crate::seq::{BaseType, Marker, SeqFormat, Sequence}; use crate::{mmscanner::MinimizerScanner, Meros}; use flate2::read::GzDecoder; use std::fs::File; -use std::io::{self, Read, Result, Seek}; +use std::io::{self, BufRead, BufReader, Read, Result, Seek}; use std::path::Path; pub fn dyn_reader>(path: P) -> Result> { @@ -32,6 +32,44 @@ pub fn open_file>(path: P) -> Result { }) } +pub fn detect_file_format>(path: P) -> io::Result { + let mut file = open_file(path)?; + let read1: Box = if is_gzipped(&mut file)? 
{ + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + + let reader = BufReader::new(read1); + let mut lines = reader.lines(); + + if let Some(first_line) = lines.next() { + let line = first_line?; + + if line.starts_with('>') { + return Ok(SeqFormat::Fasta); + } else if line.starts_with('@') { + let _ = lines.next(); + if let Some(third_line) = lines.next() { + let line: String = third_line?; + if line.starts_with('+') { + return Ok(SeqFormat::Fastq); + } + } + } else { + return Err(io::Error::new( + io::ErrorKind::Other, + "Unrecognized fasta(fastq) file format", + )); + } + } + + Err(io::Error::new( + io::ErrorKind::Other, + "Unrecognized fasta(fastq) file format", + )) +} + pub fn trim_end(buffer: &mut Vec) { while let Some(&b'\n' | &b'\r' | &b'>' | &b'@') = buffer.last() { buffer.pop(); From 03c7eac8bfc7962310f6385a10e1759893a4c868 Mon Sep 17 00:00:00 2001 From: dagou Date: Mon, 17 Jun 2024 14:18:01 +0800 Subject: [PATCH 05/18] seq kmer --- kr2r/src/bin/classify.rs | 3 +- kr2r/src/classify.rs | 130 ++++++++++++++++++--------------------- 2 files changed, 61 insertions(+), 72 deletions(-) diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index ebd5740..1604bde 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -6,6 +6,7 @@ use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{create_sample_file, find_and_sort_files, get_lastest_file_index}; use kr2r::IndexOptions; +use seqkmer::{create_reader, read_parallel, BaseType, HitGroup, Marker, Meros, Reader, SeqMer}; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -14,8 +15,6 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; -use seqkmer::{create_reader, read_parallel, BaseType, HitGroup, Marker, Meros, Reader, SeqMer}; - #[derive(Parser, Debug, Clone)] #[clap( version, diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index 
e52a2bd..f57872d 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -1,8 +1,7 @@ -use seqkmer::{BaseType, HitGroup}; - use crate::compact_hash::{Compact, HashConfig, Row}; use crate::readcounts::TaxonCountersDash; use crate::taxonomy::Taxonomy; +use seqkmer::{BaseType, HitGroup}; use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -169,77 +168,23 @@ pub fn count_values( (counts, cur_taxon_counts, hit_count) } -fn gen_hit_string(hit: &HitGroup, taxonomy: &Taxonomy, value_mask: usize) -> String { - let mut result = Vec::new(); - let mut last_pos = 0; - let count = hit.cap as u32; - - for row in &hit.rows { - let adjusted_pos = row.kmer_id - hit.offset; - - let value = row.value; - let key = value.right(value_mask); - let ext_code = taxonomy.nodes[key as usize].external_id; - - if last_pos == 0 && adjusted_pos > 0 { - result.push((0, adjusted_pos)); // 在开始处添加0 - } else if adjusted_pos - last_pos > 1 { - result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 - } - if let Some(last) = result.last_mut() { - if last.0 == ext_code { - last.1 += 1; - last_pos = adjusted_pos; - continue; - } - } - - // 添加当前key的计数 - result.push((ext_code, 1)); - last_pos = adjusted_pos; - } - - // 填充尾随0 - if last_pos < count - 1 { - if last_pos == 0 { - result.push((0, count - last_pos)); - } else { - result.push((0, count - last_pos - 1)); - } - } - - result - .iter() - .map(|i| format!("{}:{}", i.0, i.1)) - .collect::>() - .join(" ") -} - -fn adjust_hitlist_string( +fn stat_hits( hits: &BaseType>, - value_mask: usize, - taxonomy: &Taxonomy, -) -> String { - let hit_str = hits.apply(|hit| gen_hit_string(hit, taxonomy, value_mask)); - match hit_str { - BaseType::Single(hit) => hit, - BaseType::Pair((hit1, hit2)) => format!("{} |:| {}", hit1, hit2), - } -} - -fn count_rows( - hit: &BaseType>, cur_taxon_counts: &TaxonCountersDash, + counts: &mut HashMap, value_mask: usize, -) -> (HashMap, usize) { - let mut counts = HashMap::new(); - + 
taxonomy: &Taxonomy, +) -> (usize, String) { + // let mut counts = HashMap::new(); let mut hit_count: usize = 0; - // let cur_taxon_counts = TaxonCountersDash::new(); + let hit_str = hits.apply(|group| { + let mut last_pos = 0; + let count = group.cap as u32; + let mut result = Vec::new(); - hit.apply(|group| { for row in &group.rows { + // 统计计数 let value = row.value; let key = value.right(value_mask); *counts.entry(key).or_insert(0) += 1; @@ -249,10 +194,52 @@ fn count_rows( .or_default() .add_kmer(value as u64); hit_count += 1; + + let adjusted_pos = row.kmer_id - group.offset; + + let value = row.value; + let key = value.right(value_mask); + let ext_code = taxonomy.nodes[key as usize].external_id; + + if last_pos == 0 && adjusted_pos > 0 { + result.push((0, adjusted_pos)); // 在开始处添加0 + } else if adjusted_pos - last_pos > 1 { + result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 + } + if let Some(last) = result.last_mut() { + if last.0 == ext_code { + last.1 += 1; + last_pos = adjusted_pos; + continue; + } + } + + // 添加当前key的计数 + result.push((ext_code, 1)); + last_pos = adjusted_pos; + } + + // 填充尾随0 + if last_pos < count - 1 { + if last_pos == 0 { + result.push((0, count - last_pos)); + } else { + result.push((0, count - last_pos - 1)); + } } + + result + .iter() + .map(|i| format!("{}:{}", i.0, i.1)) + .collect::>() + .join(" ") }); - (counts, hit_count) + let hit_string = match hit_str { + BaseType::Single(hit) => hit, + BaseType::Pair((hit1, hit2)) => format!("{} |:| {}", hit1, hit2), + }; + (hit_count, hit_string) } pub fn process_hitgroup( @@ -266,7 +253,12 @@ pub fn process_hitgroup( minimum_hit_groups: usize, ) -> (String, u64, String) { let value_mask = hash_config.value_mask; - let (counts, hit_groups) = count_rows(&hits, cur_taxon_counts, value_mask); + + let mut counts = HashMap::new(); + let (hit_groups, hit_string) = + stat_hits(hits, cur_taxon_counts, &mut counts, value_mask, taxonomy); + + // let (counts, hit_groups) = count_rows(&hits, 
cur_taxon_counts, value_mask); let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); if call > 0 && hit_groups < minimum_hit_groups { call = 0; @@ -285,7 +277,5 @@ pub fn process_hitgroup( "U" }; - let hit_string = adjust_hitlist_string(&hits, value_mask, taxonomy); - (clasify.to_owned(), ext_call, hit_string) } From 0a5d9a28d65f96b21dcfb4c9a5c534807150980d Mon Sep 17 00:00:00 2001 From: dagou Date: Mon, 17 Jun 2024 19:11:18 +0800 Subject: [PATCH 06/18] seq kmer --- kr2r/src/classify.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index f57872d..31087e4 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -258,7 +258,6 @@ pub fn process_hitgroup( let (hit_groups, hit_string) = stat_hits(hits, cur_taxon_counts, &mut counts, value_mask, taxonomy); - // let (counts, hit_groups) = count_rows(&hits, cur_taxon_counts, value_mask); let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); if call > 0 && hit_groups < minimum_hit_groups { call = 0; From f9eca43f0972e9a65e360f2985bdd8512bc20a06 Mon Sep 17 00:00:00 2001 From: dagou Date: Tue, 18 Jun 2024 19:42:31 +0800 Subject: [PATCH 07/18] seq kmer --- kr2r/src/bin/annotate.rs | 3 +- kr2r/src/bin/classify.rs | 65 +++++++------- kr2r/src/bin/estimate_capacity.rs | 22 +++-- kr2r/src/bin/resolve.rs | 3 +- kr2r/src/bin/splitr.rs | 36 +++++--- kr2r/src/classify.rs | 23 ++--- kr2r/src/compact_hash.rs | 17 +++- kr2r/src/db.rs | 48 ++++++---- seqkmer/src/fasta.rs | 14 +-- seqkmer/src/fastq.rs | 40 +++++---- seqkmer/src/lib.rs | 1 + seqkmer/src/mmscanner.rs | 145 ++++++++++++++++++++++-------- seqkmer/src/parallel.rs | 22 ++--- seqkmer/src/reader.rs | 104 ++++++++++----------- seqkmer/src/seq.rs | 116 ++++++++++++++---------- 15 files changed, 389 insertions(+), 270 deletions(-) diff --git a/kr2r/src/bin/annotate.rs b/kr2r/src/bin/annotate.rs index dd26710..b762f5f 100644 --- a/kr2r/src/bin/annotate.rs +++ 
b/kr2r/src/bin/annotate.rs @@ -128,7 +128,8 @@ where .into_par_iter() .filter_map(|slot| { let indx = slot.idx & idx_mask; - let taxid = chtm.get_from_page(indx, slot.value, page_index); + let compacted = slot.value.left(value_bits) as u32; + let taxid = chtm.get_from_page(indx, compacted, page_index); if taxid > 0 { let kmer_id = slot.idx >> idx_bits; diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index 1604bde..057e0ac 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -1,12 +1,14 @@ use clap::Parser; -use kr2r::classify::{process_hitgroup, trim_pair_info}; +use kr2r::classify::process_hitgroup; use kr2r::compact_hash::{CHTable, Compact, HashConfig, Row}; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{create_sample_file, find_and_sort_files, get_lastest_file_index}; use kr2r::IndexOptions; -use seqkmer::{create_reader, read_parallel, BaseType, HitGroup, Marker, Meros, Reader, SeqMer}; +use seqkmer::{ + create_reader, read_parallel, BaseType, HitGroup, Meros, MinimizerIterator, Reader, SeqHeader, +}; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -37,7 +39,7 @@ pub struct Args { #[clap(short = 'S', long = "single-file-pairs", action)] pub single_file_pairs: bool, - /// Minimum quality score for FASTQ data, default is 0. + /// Minimum quality score for FASTQ data. #[clap( short = 'Q', long = "minimum-quality-score", @@ -46,7 +48,7 @@ pub struct Args { )] pub minimum_quality_score: i32, - /// Confidence score threshold, default is 0.0. + /// Confidence score threshold. #[clap( short = 'T', long = "confidence-threshold", @@ -72,7 +74,7 @@ pub struct Args { )] pub minimum_hit_groups: usize, - /// The number of threads to use, default is 10. + /// The number of threads to use. 
#[clap(short = 'p', long = "num-threads", value_parser, default_value_t = 16)] pub num_threads: i32, @@ -85,29 +87,34 @@ pub struct Args { pub input_files: Vec, } -fn process_seq(marker: &Marker, hash_config: &HashConfig, chtable: &CHTable) -> HitGroup { +fn process_seq( + m_iter: &mut MinimizerIterator, + hash_config: &HashConfig, + chtable: &CHTable, +) -> HitGroup { let chunk_size = hash_config.hash_capacity; let value_bits = hash_config.value_bits; - let mut rows = Vec::new(); - for (sort, &hash_key) in marker.minimizer.iter().enumerate() { - let idx = hash_config.index(hash_key); + + let data: Vec<(usize, u64)> = m_iter.collect(); + for (sort, hash_key) in data { + let (idx, compacted) = hash_config.compact(hash_key); let partition_index = idx / chunk_size; let index = idx % chunk_size; - let taxid = chtable.get_from_page(index, hash_key, partition_index + 1); + + let taxid = chtable.get_from_page(index, compacted, partition_index + 1); if taxid > 0 { - let compacted_key = hash_key.left(value_bits) as u32; - let high = u32::combined(compacted_key, taxid, value_bits); + let high = u32::combined(compacted, taxid, value_bits); let row = Row::new(high, 0, sort as u32 + 1); rows.push(row); } } - HitGroup::new(marker.size(), rows, 0) + + HitGroup::new(m_iter.size, rows, 0) } fn process_record( - dna_id: String, - seq: &SeqMer, + marker: &mut BaseType, args: &Args, taxonomy: &Taxonomy, chtable: &CHTable, @@ -115,12 +122,10 @@ fn process_record( cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, ) -> String { - let seq_len_str = seq.fmt_cap(); - let hits: BaseType> = seq - .marker - .apply(|marker| process_seq(&marker, &hash_config, chtable)); - - let total_kmers = seq.total_size(); + let id = marker.get_s().clone().id; + let hits = marker.apply_mut(|_, m_iter| process_seq(m_iter, &hash_config, chtable)); + let total_kmers = hits.total_marker_size(); + let seq_len_str = marker.fmt_seq_size(); let hit_data = process_hitgroup( &hits, @@ -134,7 +139,7 
@@ fn process_record( ); format!( "{}\t{}\t{}\t{}\t{}\n", - hit_data.0, dna_id, hit_data.1, seq_len_str, hit_data.2 + hit_data.0, id, hit_data.1, seq_len_str, hit_data.2 ) } @@ -162,23 +167,20 @@ where let cur_taxon_counts = TaxonCountersDash::new(); - let sequence_count = AtomicUsize::new(0); + let seq_counter = AtomicUsize::new(0); let classify_counter = AtomicUsize::new(0); let _ = read_parallel( reader, args.num_threads as usize - 2, args.num_threads as usize, - meros, + &meros, |seqs| { let mut buffer = String::new(); - for seq in seqs { - let dna_id = trim_pair_info(&seq.id); - sequence_count.fetch_add(1, Ordering::SeqCst); - + for record in seqs { + seq_counter.fetch_add(1, Ordering::SeqCst); let output_line = process_record( - dna_id, - &seq, + record, args, taxonomy, chtable, @@ -186,7 +188,6 @@ where &cur_taxon_counts, &classify_counter, ); - buffer.push_str(&output_line); } @@ -218,7 +219,7 @@ where .unwrap(); }); - let thread_sequences = sequence_count.load(Ordering::SeqCst); + let thread_sequences = seq_counter.load(Ordering::SeqCst); let thread_classified = classify_counter.load(Ordering::SeqCst); if let Some(output) = &args.kraken_output_dir { let filename = output.join(format!("output_{}.kreport2", file_index)); diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index c27342d..e046b33 100644 --- a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -89,21 +89,19 @@ fn process_sequence( &mut reader, args.threads, args.threads - 2, - meros, + &meros, |record_set| { let mut minimizer_set = HashSet::new(); - for record in record_set.into_iter() { - record - .marker - .fold(&mut minimizer_set, |minimizer_set, marker| { - let kmer_iter = marker - .minimizer - .iter() - .filter(|&hash_key| hash_key & RANGE_MASK < range_n); - - minimizer_set.extend(kmer_iter); - }); + for record in record_set { + record.fold(&mut minimizer_set, |minimizer_set, _, m_iter| { + let kmer_iter: HashSet = m_iter + 
.filter(|(_, hash_key)| *hash_key & RANGE_MASK < range_n) + .map(|(_, hash_key)| hash_key) + .collect(); + + minimizer_set.extend(kmer_iter); + }); } Some(minimizer_set) }, diff --git a/kr2r/src/bin/resolve.rs b/kr2r/src/bin/resolve.rs index a22f559..5fd79d6 100644 --- a/kr2r/src/bin/resolve.rs +++ b/kr2r/src/bin/resolve.rs @@ -1,12 +1,13 @@ use clap::Parser; use dashmap::{DashMap, DashSet}; -use kr2r::classify::{add_hitlist_string, count_values, resolve_tree, trim_pair_info}; +use kr2r::classify::{add_hitlist_string, count_values, resolve_tree}; use kr2r::compact_hash::{HashConfig, Row}; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{find_and_sort_files, open_file}; use rayon::prelude::*; +use seqkmer::trim_pair_info; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}; diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index db11ab1..e5b6d02 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -5,7 +5,7 @@ use kr2r::utils::{ get_lastest_file_index, }; use kr2r::IndexOptions; -use seqkmer::{create_reader, read_parallel, Marker, Meros, Reader}; +use seqkmer::{create_reader, read_parallel, Meros, MinimizerIterator, Reader}; use std::fs; use std::io::{BufWriter, Write}; use std::io::{Error, ErrorKind, Result}; @@ -96,14 +96,14 @@ fn init_chunk_writers( /// 处理record fn process_record( k2_slot_list: &mut Vec<(usize, Slot)>, - marker: &Marker, + marker: &mut MinimizerIterator, hash_config: &HashConfig, chunk_size: usize, seq_id: u64, idx_bits: usize, ) { let offset = k2_slot_list.len(); - for (sort, &hash_key) in marker.minimizer.iter().enumerate() { + for (sort, hash_key) in marker { let mut slot = hash_config.slot_u64(hash_key, seq_id); let seq_sort = sort + offset; let partition_index = slot.idx / chunk_size; @@ -150,24 +150,32 @@ where reader, args.num_threads as usize - 2, 
args.num_threads as usize, - meros, + &meros, |seqs| { let mut buffer = String::new(); let mut k2_slot_list = Vec::new(); - for seq in &seqs { - let dna_id = seq.id.to_owned(); - let index = seq.reads_index; - let seq_id = (file_index << 32 | index) as u64; + for seq in seqs { let mut init: Vec<(usize, Slot)> = Vec::new(); - seq.marker.fold(&mut init, |init, marker| { - process_record(init, marker, &hash_config, chunk_size, seq_id, idx_bits) + let header = seq.get_s(); + let index = header.reads_index; + let dna_id = header.id.clone(); + seq.fold(&mut init, |init, _, mut m_iter| { + let seq_id = (file_index << 32 | index) as u64; + process_record( + init, + &mut m_iter, + &hash_config, + chunk_size, + seq_id, + idx_bits, + ); }); - k2_slot_list.extend(init); + k2_slot_list.extend_from_slice(&init); - let seq_cap_str = seq.fmt_cap(); - let seq_size_str = seq.fmt_size(); + let size_str = seq.fmt_size(); + let seq_size_str = seq.fmt_seq_size(); buffer.push_str( - format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_cap_str, seq_size_str).as_str(), + format!("{}\t{}\t{}\t{}\n", index, dna_id, seq_size_str, size_str).as_str(), ); } Some((buffer, k2_slot_list)) diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index 31087e4..2b6e4cd 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -59,17 +59,6 @@ fn generate_hit_string( .join(" ") } -pub fn trim_pair_info(id: &str) -> String { - let sz = id.len(); - if sz <= 2 { - return id.to_string(); - } - if id.ends_with("/1") || id.ends_with("/2") { - return id[0..sz - 2].to_string(); - } - id.to_string() -} - // &HashMap, pub fn resolve_tree( hit_counts: &HashMap, @@ -169,7 +158,7 @@ pub fn count_values( } fn stat_hits( - hits: &BaseType>, + hits: &BaseType<(), HitGroup>, cur_taxon_counts: &TaxonCountersDash, counts: &mut HashMap, value_mask: usize, @@ -178,9 +167,9 @@ fn stat_hits( // let mut counts = HashMap::new(); let mut hit_count: usize = 0; - let hit_str = hits.apply(|group| { + let hit_str = hits.apply(|_, 
group| { let mut last_pos = 0; - let count = group.cap as u32; + let count = group.marker_size as u32; let mut result = Vec::new(); for row in &group.rows { @@ -236,14 +225,14 @@ fn stat_hits( }); let hit_string = match hit_str { - BaseType::Single(hit) => hit, - BaseType::Pair((hit1, hit2)) => format!("{} |:| {}", hit1, hit2), + BaseType::Single(_, hit) => hit, + BaseType::Pair(_, hit1, hit2) => format!("{} |:| {}", hit1, hit2), }; (hit_count, hit_string) } pub fn process_hitgroup( - hits: &BaseType>, + hits: &BaseType<(), HitGroup>, hash_config: &HashConfig, taxonomy: &Taxonomy, cur_taxon_counts: &TaxonCountersDash, diff --git a/kr2r/src/compact_hash.rs b/kr2r/src/compact_hash.rs index d7538e0..c4a87f1 100644 --- a/kr2r/src/compact_hash.rs +++ b/kr2r/src/compact_hash.rs @@ -256,6 +256,10 @@ impl HashConfig { hash_key as usize % self.capacity } + pub fn compact(&self, hash_key: u64) -> (usize, u32) { + (self.index(hash_key), hash_key.left(self.value_bits) as u32) + } + pub fn slot(&self, hash_key: u64, taxid: u32) -> Slot { let idx = self.index(hash_key); Slot::::new(idx, u32::hash_value(hash_key, self.value_bits, taxid)) @@ -370,11 +374,11 @@ impl Page { pub fn find_index( &self, index: usize, - value: u64, + compacted_key: u32, value_bits: usize, value_mask: usize, ) -> u32 { - let compacted_key = value.left(value_bits) as u32; + // let compacted_key = value.left(value_bits) as u32; let mut idx = index; if idx > self.size { return u32::default(); @@ -464,9 +468,14 @@ impl CHTable { Ok(chtm) } - pub fn get_from_page(&self, indx: usize, value: u64, page_index: usize) -> u32 { + pub fn get_from_page(&self, indx: usize, compacted: u32, page_index: usize) -> u32 { if let Some(page) = self.pages.get(page_index) { - page.find_index(indx, value, self.config.value_bits, self.config.value_mask) + page.find_index( + indx, + compacted, + self.config.value_bits, + self.config.value_mask, + ) } else { 0 } diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index cc52309..6f28bf6 
100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -7,9 +7,7 @@ use seqkmer::Meros; use crate::utils::open_file; use byteorder::{LittleEndian, WriteBytesExt}; use rayon::prelude::*; -// use seq_io::fasta::{Reader, Record}; -// use seq_io::parallel::read_parallel; -use seqkmer::{read_parallel as s_parallel, FastaReader}; +use seqkmer::{read_parallel, FastaReader}; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, BufWriter, Read, Result as IOResult, Write}; @@ -258,30 +256,48 @@ pub fn convert_fna_to_k2_format>( let value_bits = hash_config.value_bits; let cell_size = std::mem::size_of::>(); - s_parallel( + read_parallel( &mut reader, threads as usize, queue_len, - meros, + &meros, |seqs| { let mut k2_cell_list = Vec::new(); - for record in seqs.iter() { - if let Some(ext_taxid) = id_to_taxon_map.get(&record.id) { - let taxid = taxonomy.get_internal_id(*ext_taxid); - record - .marker - .fold(&mut k2_cell_list, |k2_cell_list, marker| { - for &hash_key in marker.minimizer.iter() { + for record in seqs { + record.fold(&mut k2_cell_list, |k2_cell_list, header, m_iter| { + if let Some(ext_taxid) = id_to_taxon_map.get(&header.id) { + let taxid = taxonomy.get_internal_id(*ext_taxid); + let k2_cell: Vec<(usize, Slot)> = m_iter + .map(|(_, hash_key)| { let index: usize = hash_config.index(hash_key); let idx = index % chunk_size; let partition_index = index / chunk_size; let cell = Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); - k2_cell_list.push((partition_index, cell)); - } - }); - } + (partition_index, cell) + }) + .collect(); + + k2_cell_list.extend_from_slice(&k2_cell); + } + }); + + // if let Some(ext_taxid) = id_to_taxon_map.get(&record.id) { + // let taxid = taxonomy.get_internal_id(*ext_taxid); + // record + // .marker + // .fold(&mut k2_cell_list, |k2_cell_list, marker| { + // for &hash_key in marker.minimizer.iter() { + // let index: usize = hash_config.index(hash_key); + // let idx = index % chunk_size; + // let 
partition_index = index / chunk_size; + // let cell = + // Slot::new(idx, u32::hash_value(hash_key, value_bits, taxid)); + // k2_cell_list.push((partition_index, cell)); + // } + // }); + // } } Some(k2_cell_list) }, diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs index aa0e173..9bbbf33 100644 --- a/seqkmer/src/fasta.rs +++ b/seqkmer/src/fasta.rs @@ -1,5 +1,5 @@ -use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; -use crate::seq::{BaseType, SeqFormat, Sequence}; +use crate::reader::{dyn_reader, trim_end, Reader, SeqVecType, BUFSIZE}; +use crate::seq::{BaseType, SeqFormat, SeqHeader}; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; @@ -65,7 +65,7 @@ fn check_sequence_length(seq: &Vec) -> bool { } impl Reader for FastaReader { - fn next(&mut self) -> Result>> { + fn next(&mut self) -> Result> { if self.read_next()?.is_none() { return Ok(None); } @@ -94,13 +94,13 @@ impl Reader for FastaReader { }; self.reads_index += 1; - let sequence = Sequence { + let seq_header = SeqHeader { file_index: self.file_index, reads_index: self.reads_index, - id: seq_id.to_owned(), - seq: BaseType::Single(self.seq.to_owned()), format: SeqFormat::Fasta, + id: seq_id.to_owned(), }; - Ok(Some(vec![sequence])) + let seq = BaseType::Single(seq_header, self.seq.to_owned()); + Ok(Some(vec![seq])) } } diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index ab61784..46e19b7 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -1,8 +1,10 @@ -use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; -use crate::seq::{BaseType, SeqFormat, Sequence}; +use crate::reader::{dyn_reader, trim_end, trim_pair_info, Reader, SeqVecType, BUFSIZE}; +use crate::seq::{BaseType, SeqFormat, SeqHeader}; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; +type SeqType = BaseType>; + struct QReader { reader: BufReader, quality_score: i32, @@ -100,7 +102,7 @@ where } } - pub fn read_next(&mut self) -> Result> { + pub fn read_next(&mut self) 
-> Result> { if self.inner.read_next()?.is_none() { return Ok(None); } @@ -118,14 +120,15 @@ where }; self.reads_index += 1; - let sequence = Sequence { + let seq_header = SeqHeader { file_index: self.file_index, reads_index: self.reads_index, + format: SeqFormat::Fasta, id: seq_id.to_owned(), - seq: BaseType::Single(self.inner.seq.to_owned()), - format: SeqFormat::Fastq, }; - Ok(Some(sequence)) + + let seq = BaseType::Single(seq_header, self.inner.seq.to_owned()); + Ok(Some(seq)) } } @@ -145,8 +148,8 @@ impl Reader for FastqReader where R: Read + Send, { - fn next(&mut self) -> Result>> { - let seqs: Vec = (0..self.batch_size) + fn next(&mut self) -> Result> { + let seqs: SeqVecType = (0..self.batch_size) .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> .collect::>>()?; @@ -190,7 +193,7 @@ where } } - pub fn read_next(&mut self) -> Result> { + pub fn read_next(&mut self) -> Result> { if self.inner1.read_next()?.is_none() { return Ok(None); } @@ -212,13 +215,18 @@ where }; self.reads_index += 1; - let sequence = Sequence { + let seq_header = SeqHeader { file_index: self.file_index, reads_index: self.reads_index, - id: seq_id.to_owned(), - seq: BaseType::Pair((self.inner1.seq.to_owned(), self.inner2.seq.to_owned())), - format: SeqFormat::PairFastq, + format: SeqFormat::Fasta, + id: trim_pair_info(seq_id), }; + + let sequence = BaseType::Pair( + seq_header, + self.inner1.seq.to_owned(), + self.inner2.seq.to_owned(), + ); Ok(Some(sequence)) } } @@ -241,8 +249,8 @@ impl Reader for FastqPairReader where R: Read + Send, { - fn next(&mut self) -> Result>> { - let seqs: Vec = (0..self.batch_size) + fn next(&mut self) -> Result> { + let seqs: SeqVecType = (0..self.batch_size) .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> .collect::>>()?; diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs index 7eab6e2..3c9350c 100644 --- a/seqkmer/src/lib.rs +++ b/seqkmer/src/lib.rs @@ -10,6 +10,7 @@ pub use fasta::*; pub use fastq::*; 
pub use feat::constants::*; pub use feat::*; +pub use mmscanner::MinimizerIterator; pub use parallel::*; pub use reader::*; pub use seq::*; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs index 8fe7498..9775f9a 100644 --- a/seqkmer/src/mmscanner.rs +++ b/seqkmer/src/mmscanner.rs @@ -1,5 +1,5 @@ // kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash -use crate::seq::{BaseType, Marker}; +use crate::seq::{BaseType, SeqHeader}; use crate::{ canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, }; @@ -15,7 +15,7 @@ fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { } #[derive(Debug)] -struct MinimizerData { +pub struct MinimizerData { pos: usize, candidate_lmer: u64, } @@ -98,7 +98,8 @@ impl MinimizerWindow { } } -struct Cursor { +#[derive(Clone, Copy)] +pub struct Cursor { pos: usize, capacity: usize, value: u64, @@ -134,56 +135,128 @@ impl Cursor { } } -pub struct MinimizerScanner<'a> { - seq: &'a BaseType>, - meros: Meros, +pub struct MinimizerIterator<'a> { cursor: Cursor, window: MinimizerWindow, + seq: &'a [u8], + meros: &'a Meros, + pos: usize, + end: usize, + pub size: usize, } -impl<'a> MinimizerScanner<'a> { - pub fn new(seq: &'a BaseType>, meros: Meros) -> Self { - MinimizerScanner { +impl<'a> MinimizerIterator<'a> { + pub fn new(seq: &'a [u8], cursor: Cursor, window: MinimizerWindow, meros: &'a Meros) -> Self { + MinimizerIterator { + cursor, + window, seq, meros, - cursor: Cursor::new(meros.l_mer, meros.mask), - window: MinimizerWindow::new(meros.window_size()), + pos: 0, + size: 0, + end: seq.len(), } } - #[inline] - fn clear(&mut self) { + fn clear_state(&mut self) { self.cursor.clear(); self.window.clear(); } - fn iter_seq(&mut self, seq: &Vec) -> Marker { - let minimizer = seq - .iter() - .filter_map(|&ch| { - if ch == b'\n' || ch == b'\r' { - None - } else { - match char_to_value(ch) { - Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { - let candidate_lmer: u64 = to_candidate_lmer(&self.meros, 
lmer); - self.window - .next(candidate_lmer) - .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) - }), - None => { - self.clear(); - None - } + pub fn seq_size(&self) -> usize { + self.end + } +} + +impl<'a> Iterator for MinimizerIterator<'a> { + type Item = (usize, u64); + + fn next(&mut self) -> Option { + // self.sequence + // .iter() + // .filter_map(|&ch| { + // if ch == b'\n' || ch == b'\r' { + // None + // } else { + // match char_to_value(ch) { + // Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { + // let candidate_lmer: u64 = to_candidate_lmer(&self.meros, lmer); + // self.window + // .next(candidate_lmer) + // .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) + // }), + // None => { + // self.clear_state(); + // None + // } + // } + // } + // }) + // .next() + while self.pos < self.end { + let ch = self.seq[self.pos]; + self.pos += 1; + if ch == b'\n' || ch == b'\r' { + continue; + } else { + let data = match char_to_value(ch) { + Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { + let candidate_lmer = to_candidate_lmer(&self.meros, lmer); + self.window + .next(candidate_lmer) + .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) + }), + None => { + self.clear_state(); + None } + }; + if data.is_some() { + self.size += 1; + return Some((self.size, data.unwrap())); } - }) - .collect(); + } + } + None + } +} - Marker::new(seq.len(), minimizer) +impl<'a> BaseType> { + pub fn seq_size_str(&self) -> BaseType<(), String> { + self.apply(|_, m_iter| m_iter.seq_size().to_string()) + } + + pub fn fmt_seq_size(&self) -> String { + match &self { + BaseType::Single(_, m_iter) => m_iter.seq_size().to_string(), + BaseType::Pair(_, m_iter1, m_iter2) => { + format!("{}|{}", m_iter1.seq_size(), m_iter2.seq_size()) + } + } } - pub fn iter(&mut self) -> BaseType { - self.seq.apply(|seq| self.iter_seq(seq)) + pub fn fmt_size(&self) -> String { + match &self { + BaseType::Single(_, m_iter) => 
m_iter.size.to_string(), + BaseType::Pair(_, m_iter1, m_iter2) => { + format!("{}|{}", m_iter1.size, m_iter2.size) + } + } + } +} +pub fn scan_sequence<'a>( + sequence: &'a BaseType>, + meros: &'a Meros, +) -> BaseType> { + let func = |seq: &'a Vec| { + let cursor = Cursor::new(meros.l_mer, meros.mask); + let window = MinimizerWindow::new(meros.window_size()); + MinimizerIterator::new(&seq, cursor, window, meros) + }; + match sequence { + BaseType::Single(header, seq) => BaseType::Single(header.clone(), func(seq)), + BaseType::Pair(header, seq1, seq2) => { + BaseType::Pair(header.clone(), func(seq1), func(seq2)) + } } } diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index 3ccce75..b54249e 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -1,7 +1,8 @@ use crate::fasta::FastaReader; use crate::fastq::{FastqPairReader, FastqReader}; -use crate::reader::{detect_file_format, Reader, SeqMer}; -use crate::seq::Sequence; +use crate::mmscanner::{scan_sequence, MinimizerIterator}; +use crate::reader::{detect_file_format, Reader}; +use crate::seq::{BaseType, SeqHeader}; use crate::{Meros, SeqFormat}; use crossbeam_channel::{bounded, Receiver}; use scoped_threadpool::Pool; @@ -53,7 +54,7 @@ pub fn read_parallel( reader: &mut R, n_threads: usize, buffer_len: usize, - meros: Meros, + meros: &Meros, work: W, func: F, ) -> Result<()> @@ -61,12 +62,12 @@ where R: Reader, O: Send, Out: Send + Default, - W: Send + Sync + Fn(Vec) -> Option, + W: Send + Sync + Fn(&mut Vec>) -> Option, F: FnOnce(&mut ParallelResult>) -> Out + Send, { assert!(n_threads > 2); assert!(n_threads <= buffer_len); - let (sender, receiver) = bounded::>(buffer_len); + let (sender, receiver) = bounded::>>>(buffer_len); let (done_send, done_recv) = bounded::>(buffer_len); let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver let done_send = Arc::new(done_send); @@ -88,13 +89,12 @@ where let work = &work; let done_send = Arc::clone(&done_send); pool_scope.execute(move || { - 
while let Ok(seqs) = receiver.recv() { - let seq_mers: Vec = seqs - .iter() - .map(|seq| SeqMer::from_seq(seq, meros)) + while let Ok(mut seqs) = receiver.recv() { + let mut markers: Vec>> = seqs + .iter_mut() + .map(|seq| scan_sequence(seq, &meros)) .collect(); - - let output = work(seq_mers); + let output = work(&mut markers); done_send.send(output).expect("Failed to send outputs"); } }); diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index a74954c..d22321f 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -1,5 +1,4 @@ -use crate::seq::{BaseType, Marker, SeqFormat, Sequence}; -use crate::{mmscanner::MinimizerScanner, Meros}; +use crate::seq::{BaseType, SeqFormat, SeqHeader}; use flate2::read::GzDecoder; use std::fs::File; use std::io::{self, BufRead, BufReader, Read, Result, Seek}; @@ -22,6 +21,17 @@ pub fn is_gzipped(file: &mut File) -> Result { Ok(buffer == [0x1F, 0x8B]) } +pub fn trim_pair_info(id: &str) -> String { + let sz = id.len(); + if sz <= 2 { + return id.to_string(); + } + if id.ends_with("/1") || id.ends_with("/2") { + return id[0..sz - 2].to_string(); + } + id.to_string() +} + pub fn open_file>(path: P) -> Result { File::open(&path).map_err(|e| { if e.kind() == io::ErrorKind::NotFound { @@ -78,66 +88,22 @@ pub fn trim_end(buffer: &mut Vec) { pub const BUFSIZE: usize = 16 * 1024 * 1024; +pub type SeqVecType = Vec>>; + pub trait Reader: Send { - fn next(&mut self) -> Result>>; + fn next(&mut self) -> Result>; } impl Reader for Box { - fn next(&mut self) -> Result>> { + fn next(&mut self) -> Result> { (**self).next() } } -#[derive(Debug, Clone)] -pub struct SeqMer { - pub id: String, - pub file_index: usize, - pub reads_index: usize, - pub marker: BaseType, -} - -impl SeqMer { - pub fn from_seq(seq: &Sequence, meros: Meros) -> Self { - let mut ms = MinimizerScanner::new(&seq.seq, meros); - let marker = ms.iter(); - Self { - marker, - id: seq.id.clone(), - file_index: seq.file_index, - reads_index: seq.reads_index, - } - } - 
- pub fn cap_str(&self) -> BaseType { - self.marker.apply(|marker| marker.cap.to_string()) - } - - pub fn total_size(&self) -> usize { - match &self.marker { - BaseType::Single(marker1) => marker1.size(), - BaseType::Pair((marker1, marker2)) => marker1.size() + marker2.size(), - } - } - - pub fn fmt_cap(&self) -> String { - match &self.marker { - BaseType::Single(marker1) => marker1.cap.to_string(), - BaseType::Pair((marker1, marker2)) => format!("{}|{}", marker1.cap, marker2.cap), - } - } - - pub fn fmt_size(&self) -> String { - match &self.marker { - BaseType::Single(marker1) => marker1.size().to_string(), - BaseType::Pair((marker1, marker2)) => format!("{}|{}", marker1.size(), marker2.size()), - } - } -} - #[derive(Debug)] pub struct HitGroup { - /// minimizer capacity - pub cap: usize, + /// minimizer data size + pub marker_size: usize, /// hit value vector pub rows: Vec, /// pair offset @@ -145,12 +111,16 @@ pub struct HitGroup { } impl HitGroup { - pub fn new(cap: usize, rows: Vec, offset: u32) -> Self { - Self { cap, rows, offset } + pub fn new(marker_size: usize, rows: Vec, offset: u32) -> Self { + Self { + marker_size, + rows, + offset, + } } } -impl BaseType> { +impl BaseType> { /// Synchronizes the offset of the second element of a `Pair` to the `cap` of the first element. /// This alignment is only necessary when the `rows` property of the `HitGroup` is in an /// increasing order. 
If `rows` is not increasing, aligning the offset based on `cap` may not @@ -166,8 +136,28 @@ impl BaseType> { /// pair.align_offset(); /// ``` pub fn align_offset(&mut self) { - if let BaseType::Pair((ref first, ref mut second)) = self { - second.offset = first.cap as u32; + if let BaseType::Pair(_, ref first, ref mut second) = self { + second.offset = first.marker_size as u32; } } + + pub fn total_marker_size(&self) -> usize { + match &self { + BaseType::Single(_, hit) => hit.marker_size, + BaseType::Pair(_, hit1, hit2) => hit1.marker_size + hit2.marker_size, + } + } + + // pub fn seq_size_str(&self) -> BaseType<(), String> { + // self.apply(|_, hit| hit.seq_size.to_string()) + // } + + // pub fn fmt_seq_size(&self) -> String { + // match &self { + // BaseType::Single(_, hit) => hit.seq_size.to_string(), + // BaseType::Pair(_, hit1, hit2) => { + // format!("{}|{}", hit1.seq_size, hit2.seq_size) + // } + // } + // } } diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs index 04ccfba..d17a7dd 100644 --- a/seqkmer/src/seq.rs +++ b/seqkmer/src/seq.rs @@ -6,78 +6,102 @@ pub enum SeqFormat { } #[derive(Debug, Clone, PartialEq, Eq)] -pub enum BaseType { - Single(T), - Pair((T, T)), +pub struct SeqHeader { + pub id: String, + pub file_index: usize, + pub reads_index: usize, + pub format: SeqFormat, } -impl BaseType { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BaseType { + Single(S, T), + Pair(S, T, T), +} + +impl BaseType { // 泛型方法,根据序列类型执行操作 - pub fn apply(&self, mut func: F) -> BaseType + pub fn apply<'a, U, F>(&'a self, mut func: F) -> BaseType<(), U> where - F: FnMut(&T) -> U, + F: FnMut(&'a S, &'a T) -> U, { match self { - BaseType::Single(seq) => BaseType::Single(func(seq)), - BaseType::Pair((seq1, seq2)) => BaseType::Pair((func(seq1), func(seq2))), + BaseType::Single(prop, seq) => BaseType::Single((), func(prop, seq)), + BaseType::Pair(prop, seq1, seq2) => { + BaseType::Pair((), func(prop, seq1), func(prop, seq2)) + } } } - pub fn fold(&self, init: 
&mut V, mut func: F) -> BaseType + pub fn apply_mut<'a, U, F>(&'a mut self, mut func: F) -> BaseType<(), U> where - F: FnMut(&mut V, &T) -> U, + F: FnMut(&'a S, &'a mut T) -> U, { match self { - BaseType::Single(seq) => BaseType::Single(func(init, seq)), - BaseType::Pair((seq1, seq2)) => { - let res1 = func(init, seq1); - let res2 = func(init, seq2); - BaseType::Pair((res1, res2)) + BaseType::Single(prop, seq) => BaseType::Single((), func(prop, seq)), + BaseType::Pair(prop, seq1, seq2) => { + BaseType::Pair((), func(prop, seq1), func(prop, seq2)) } } } - pub fn modify(&mut self, mut func: F) + pub fn get_s(&self) -> &S { + match self { + BaseType::Single(prop, _) => prop, + BaseType::Pair(prop, _, _) => prop, + } + } + + pub fn transform<'a, U, F, V>(&mut self, mut func: F) -> BaseType where - F: FnMut(&mut T), + F: for<'b> FnMut(&S, &'b mut T) -> (V, U), { match self { - BaseType::Single(ref mut seq) => func(seq), - BaseType::Pair((ref mut seq1, ref mut seq2)) => { - func(seq1); - func(seq2); + BaseType::Single(prop, seq) => { + let res1 = func(prop, seq); + BaseType::Single(res1.0, res1.1) + } + BaseType::Pair(prop, seq1, seq2) => { + let res1 = func(prop, seq1); + let res2 = func(prop, seq2); + BaseType::Pair(res1.0, res1.1, res2.1) } } } -} -impl BaseType> { - pub fn len(&self) -> BaseType { - self.apply(|seq| seq.len()) - } -} - -#[derive(Debug, Clone)] -pub struct Marker { - pub cap: usize, - pub minimizer: Vec, -} - -impl Marker { - pub fn new(cap: usize, minimizer: Vec) -> Self { - Self { cap, minimizer } + pub fn fold(&mut self, init: &mut V, mut func: F) -> BaseType<(), U> + where + F: FnMut(&mut V, &S, &mut T) -> U, + { + match self { + BaseType::Single(prop, seq) => BaseType::Single((), func(init, prop, seq)), + BaseType::Pair(prop, ref mut seq1, ref mut seq2) => { + let res1 = func(init, prop, seq1); + let res2 = func(init, prop, seq2); + BaseType::Pair((), res1, res2) + } + } } - pub fn size(&self) -> usize { - self.minimizer.len() + pub fn 
modify(&mut self, mut func: F) + where + F: FnMut(&S, &mut T), + { + match self { + BaseType::Single(prop, ref mut seq) => func(prop, seq), + BaseType::Pair(prop, ref mut seq1, ref mut seq2) => { + func(prop, seq1); + func(prop, seq2); + } + } } } -#[derive(Debug, Clone)] -pub struct Sequence { - pub file_index: usize, - pub reads_index: usize, - pub id: String, - pub seq: BaseType>, - pub format: SeqFormat, +impl BaseType> +where + S: Copy, +{ + pub fn len(&self) -> BaseType<(), usize> { + self.apply(|_, seq| seq.len()) + } } From b48a1a1ec56cea28e969a35af85365ae3303800a Mon Sep 17 00:00:00 2001 From: dagou Date: Wed, 19 Jun 2024 16:01:21 +0800 Subject: [PATCH 08/18] seq kmer --- seqkmer/src/fastq.rs | 123 ++++++++++++++++++++++++++++++++++++++++++ seqkmer/src/lib.rs | 1 + seqkmer/src/reader.rs | 21 ++------ seqkmer/src/seq.rs | 43 ++++++++++++++- seqkmer/src/utils.rs | 41 ++++++++++++++ 5 files changed, 211 insertions(+), 18 deletions(-) create mode 100644 seqkmer/src/utils.rs diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index 46e19b7..ed035f7 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -257,3 +257,126 @@ where Ok(Some(seqs).filter(|v| !v.is_empty())) } } + +use crate::seq::OptionPair; + +pub struct FastxPairReader { + inner: OptionPair>, + file_index: usize, + reads_index: usize, + // 批量读取 + batch_size: usize, +} + +impl FastxPairReader +where + R: Read + Send, +{ + pub fn new(readers: OptionPair, file_index: usize, quality_score: i32) -> Self { + Self::with_capacity(readers, file_index, BUFSIZE, quality_score, 30) + } + + pub fn with_capacity<'a>( + readers: OptionPair, + file_index: usize, + capacity: usize, + quality_score: i32, + batch_size: usize, + ) -> Self { + assert!(capacity >= 3); + let inner = match readers { + OptionPair::Single(reader) => { + OptionPair::Single(QReader::with_capacity(reader, capacity, quality_score)) + } + OptionPair::Pair(reader1, reader2) => OptionPair::Pair( + QReader::with_capacity(reader1, 
capacity, quality_score), + QReader::with_capacity(reader2, capacity, quality_score), + ), + }; + Self { + inner, + file_index, + reads_index: 0, + batch_size, + } + } + + fn create_seq_header(reader: &QReader, file_index: usize, reads_index: usize) -> SeqHeader { + let seq_id = unsafe { + let s = std::str::from_utf8_unchecked(&reader.header[1..]); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + SeqHeader { + file_index, + reads_index, + format: SeqFormat::Fastq, + id: trim_pair_info(seq_id), + } + } + + pub fn read_next(&mut self) -> Result> { + match &mut self.inner { + OptionPair::Single(reader) => { + if reader.read_next()?.is_none() { + return Ok(None); + } + + self.reads_index += 1; + + let seq_header = + Self::create_seq_header(&reader, self.file_index, self.reads_index); + Ok(Some(BaseType::Single(seq_header, reader.seq.to_owned()))) + } + OptionPair::Pair(reader1, reader2) => { + if reader1.read_next()?.is_none() { + return Ok(None); + } + if reader2.read_next()?.is_none() { + return Ok(None); + } + + self.reads_index += 1; + let seq_header = + Self::create_seq_header(&reader1, self.file_index, self.reads_index); + + Ok(Some(BaseType::Pair( + seq_header, + reader1.seq.to_owned(), + reader2.seq.to_owned(), + ))) + } + } + } +} + +impl FastxPairReader> { + #[inline] + pub fn from_path>( + paths: OptionPair

, + file_index: usize, + quality_score: i32, + ) -> Result { + let readers = paths.map(|path| dyn_reader(path))?; + Ok(Self::new(readers, file_index, quality_score)) + } +} + +impl Reader for FastxPairReader +where + R: Read + Send, +{ + fn next(&mut self) -> Result> { + let seqs: SeqVecType = (0..self.batch_size) + .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> + .collect::>>()?; + + Ok(Some(seqs).filter(|v| !v.is_empty())) + } +} diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs index 3c9350c..9faf476 100644 --- a/seqkmer/src/lib.rs +++ b/seqkmer/src/lib.rs @@ -5,6 +5,7 @@ mod mmscanner; mod parallel; mod reader; mod seq; +mod utils; pub use fasta::*; pub use fastq::*; diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index d22321f..3f274f7 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -4,7 +4,7 @@ use std::fs::File; use std::io::{self, BufRead, BufReader, Read, Result, Seek}; use std::path::Path; -pub fn dyn_reader>(path: P) -> Result> { +pub(crate) fn dyn_reader>(path: P) -> Result> { let mut file = open_file(path)?; if is_gzipped(&mut file)? { let decoder = GzDecoder::new(file); @@ -14,7 +14,7 @@ pub fn dyn_reader>(path: P) -> Result> { } } -pub fn is_gzipped(file: &mut File) -> Result { +pub(crate) fn is_gzipped(file: &mut File) -> Result { let mut buffer = [0; 2]; file.read_exact(&mut buffer)?; file.rewind()?; // 重置文件指针到开头 @@ -42,7 +42,7 @@ pub fn open_file>(path: P) -> Result { }) } -pub fn detect_file_format>(path: P) -> io::Result { +pub(crate) fn detect_file_format>(path: P) -> io::Result { let mut file = open_file(path)?; let read1: Box = if is_gzipped(&mut file)? 
{ Box::new(GzDecoder::new(file)) @@ -80,7 +80,7 @@ pub fn detect_file_format>(path: P) -> io::Result { )) } -pub fn trim_end(buffer: &mut Vec) { +pub(crate) fn trim_end(buffer: &mut Vec) { while let Some(&b'\n' | &b'\r' | &b'>' | &b'@') = buffer.last() { buffer.pop(); } @@ -147,17 +147,4 @@ impl BaseType> { BaseType::Pair(_, hit1, hit2) => hit1.marker_size + hit2.marker_size, } } - - // pub fn seq_size_str(&self) -> BaseType<(), String> { - // self.apply(|_, hit| hit.seq_size.to_string()) - // } - - // pub fn fmt_seq_size(&self) -> String { - // match &self { - // BaseType::Single(_, hit) => hit.seq_size.to_string(), - // BaseType::Pair(_, hit1, hit2) => { - // format!("{}|{}", hit1.seq_size, hit2.seq_size) - // } - // } - // } } diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs index d17a7dd..f78cf57 100644 --- a/seqkmer/src/seq.rs +++ b/seqkmer/src/seq.rs @@ -2,7 +2,6 @@ pub enum SeqFormat { Fasta, Fastq, - PairFastq, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -13,6 +12,48 @@ pub struct SeqHeader { pub format: SeqFormat, } +#[derive(Debug, Clone)] +pub enum OptionPair { + Single(T), + Pair(T, T), +} + +impl OptionPair { + // 它接受一个泛型闭包 F,并返回一个新的 OptionPair + pub fn map(self, mut f: F) -> Result, E> + where + F: FnMut(T) -> Result, + { + match self { + OptionPair::Single(t) => f(t).map(OptionPair::Single), + OptionPair::Pair(t1, t2) => { + let u1 = f(t1)?; + let u2 = f(t2)?; + Ok(OptionPair::Pair(u1, u2)) + } + } + } +} + +impl OptionPair { + pub fn from_slice(slice: &[T]) -> OptionPair { + match slice { + [a, b] => OptionPair::Pair(a.clone(), b.clone()), + [a] => OptionPair::Single(a.clone()), + _ => unreachable!(), + } + } +} + +impl From<(T, Option)> for OptionPair { + fn from(tuple: (T, Option)) -> Self { + match tuple { + (a, Some(b)) => OptionPair::Pair(a, b), + (a, None) => OptionPair::Single(a), + } + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BaseType { Single(S, T), diff --git a/seqkmer/src/utils.rs b/seqkmer/src/utils.rs new file 
mode 100644 index 0000000..7722fe9 --- /dev/null +++ b/seqkmer/src/utils.rs @@ -0,0 +1,41 @@ +#[derive(Debug, Clone)] +pub enum OptionPair { + Single(S, T), + Pair(S, T, T), +} + +impl OptionPair { + // 它接受一个泛型闭包 F,并返回一个新的 OptionPair + pub fn map(self, mut f: F) -> Result, E> + where + F: FnMut(S, T) -> Result, + { + match self { + OptionPair::Single(s, t) => f(s, t).map(|u| OptionPair::Single(s.clone(), u)), + OptionPair::Pair(s, t1, t2) => { + let u1 = f(s, t1)?; + let u2 = f(s, t2)?; + Ok(OptionPair::Pair(s, u1, u2)) + } + } + } +} + +impl OptionPair { + pub fn from_slice(s: S, slice: &[T]) -> OptionPair { + match slice { + [a, b] => OptionPair::Pair(s, a.clone(), b.clone()), + [a] => OptionPair::Single(s, a.clone()), + _ => unreachable!(), + } + } +} + +impl From<(S, T, Option)> for OptionPair { + fn from(tuple: (S, T, Option)) -> Self { + match tuple { + (s, a, Some(b)) => OptionPair::Pair(s, a, b), + (s, a, None) => OptionPair::Single(s, a), + } + } +} From d5449dddf49cf76a447502761d4407939c487042 Mon Sep 17 00:00:00 2001 From: dagou Date: Wed, 19 Jun 2024 16:47:58 +0800 Subject: [PATCH 09/18] seq kmer --- seqkmer/src/fasta.rs | 36 ++++++++++++++----- seqkmer/src/fastq.rs | 71 +++++++++++++++++++++++++++++++++++++- seqkmer/src/parallel.rs | 2 -- seqkmer/src/utils.rs | 76 ++++++++++++++++++++--------------------- 4 files changed, 136 insertions(+), 49 deletions(-) diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs index 9bbbf33..350ed08 100644 --- a/seqkmer/src/fasta.rs +++ b/seqkmer/src/fasta.rs @@ -1,5 +1,5 @@ use crate::reader::{dyn_reader, trim_end, Reader, SeqVecType, BUFSIZE}; -use crate::seq::{BaseType, SeqFormat, SeqHeader}; +use crate::seq::{self, BaseType, SeqFormat, SeqHeader}; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; @@ -34,6 +34,21 @@ where } } + pub fn read_next_entry(&mut self) -> Result, Vec)>> { + // 读取fastq文件header部分 + let mut header = Vec::new(); + if self.reader.read_until(b'\n', &mut header)? 
== 0 { + return Ok(None); + } + // 读取fasta文件seq部分 + let mut seq = Vec::new(); + if self.reader.read_until(b'>', &mut seq)? == 0 { + return Ok(None); + } + trim_end(&mut self.seq); + Ok(Some((header, seq))) + } + pub fn read_next(&mut self) -> Result> { // 读取fastq文件header部分 self.header.clear(); @@ -66,20 +81,25 @@ fn check_sequence_length(seq: &Vec) -> bool { impl Reader for FastaReader { fn next(&mut self) -> Result> { - if self.read_next()?.is_none() { + // if self.read_next()?.is_none() { + // return Ok(None); + // } + + let entry = self.read_next_entry()?; + if entry.is_none() { return Ok(None); } - - if check_sequence_length(&self.seq) { + let (header, seq) = entry.unwrap(); + if check_sequence_length(&seq) { eprintln!("Sequence length exceeds 2^32, which is not handled."); return Ok(None); } let seq_id = unsafe { - let slice = if self.header.starts_with(b">") { - &self.header[1..] + let slice = if header.starts_with(b">") { + &header[1..] } else { - &self.header[..] + &header[..] }; let s = std::str::from_utf8_unchecked(slice); @@ -100,7 +120,7 @@ impl Reader for FastaReader { format: SeqFormat::Fasta, id: seq_id.to_owned(), }; - let seq = BaseType::Single(seq_header, self.seq.to_owned()); + let seq = BaseType::Single(seq_header, seq); Ok(Some(vec![seq])) } } diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index ed035f7..e0b6c21 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -31,6 +31,43 @@ where } } + pub fn read_next_entry(&mut self) -> Result, Vec)>> { + // 读取fastq文件header部分 + let mut header: Vec = Vec::new(); + if self.reader.read_until(b'\n', &mut header)? == 0 { + return Ok(None); + } + // 读取fastq文件seq部分 + let mut seq = Vec::new(); + if self.reader.read_until(b'\n', &mut seq)? == 0 { + return Ok(None); + } + trim_end(&mut seq); + + // 读取fastq文件+部分 + self.plus.clear(); + if self.reader.read_until(b'\n', &mut self.plus)? 
== 0 { + return Ok(None); + } + + // 读取fastq文件quals部分 + self.quals.clear(); + if self.reader.read_until(b'\n', &mut self.quals)? == 0 { + return Ok(None); + } + trim_end(&mut self.quals); + + if self.quality_score > 0 { + for (base, &qscore) in self.seq.iter_mut().zip(self.quals.iter()) { + if (qscore as i32 - '!' as i32) < self.quality_score { + *base = b'x'; + } + } + } + + Ok(Some((header, seq))) + } + pub fn read_next(&mut self) -> Result> { // 读取fastq文件header部分 self.header.clear(); @@ -102,6 +139,38 @@ where } } + pub fn read_next_entry(&mut self) -> Result> { + let entry = self.inner.read_next_entry()?; + if entry.is_none() { + return Ok(None); + } + + let (header, seq) = entry.unwrap(); + + let seq_id = unsafe { + let s = std::str::from_utf8_unchecked(&header[1..]); + let first_space_index = s + .as_bytes() + .iter() + .position(|&c| c == b' ') + .unwrap_or(s.len()); + + // 直接从原始切片创建第一个单词的切片 + &s[..first_space_index] + }; + self.reads_index += 1; + + let seq_header = SeqHeader { + file_index: self.file_index, + reads_index: self.reads_index, + format: SeqFormat::Fasta, + id: seq_id.to_owned(), + }; + + let seq = BaseType::Single(seq_header, seq); + Ok(Some(seq)) + } + pub fn read_next(&mut self) -> Result> { if self.inner.read_next()?.is_none() { return Ok(None); @@ -150,7 +219,7 @@ where { fn next(&mut self) -> Result> { let seqs: SeqVecType = (0..self.batch_size) - .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> + .filter_map(|_| self.read_next_entry().transpose()) // 将 Result, _> 转换为 Option> .collect::>>()?; Ok(Some(seqs).filter(|v| !v.is_empty())) diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index b54249e..ac829e3 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -45,8 +45,6 @@ pub fn create_reader( } } SeqFormat::Fasta => Ok(Box::new(FastaReader::from_path(file1, file_index)?)), - - _ => unreachable!(), } } diff --git a/seqkmer/src/utils.rs b/seqkmer/src/utils.rs index 7722fe9..eae4dd9 
100644 --- a/seqkmer/src/utils.rs +++ b/seqkmer/src/utils.rs @@ -1,41 +1,41 @@ -#[derive(Debug, Clone)] -pub enum OptionPair { - Single(S, T), - Pair(S, T, T), -} +// #[derive(Debug, Clone)] +// pub enum OptionPair { +// Single(S, T), +// Pair(S, T, T), +// } -impl OptionPair { - // 它接受一个泛型闭包 F,并返回一个新的 OptionPair - pub fn map(self, mut f: F) -> Result, E> - where - F: FnMut(S, T) -> Result, - { - match self { - OptionPair::Single(s, t) => f(s, t).map(|u| OptionPair::Single(s.clone(), u)), - OptionPair::Pair(s, t1, t2) => { - let u1 = f(s, t1)?; - let u2 = f(s, t2)?; - Ok(OptionPair::Pair(s, u1, u2)) - } - } - } -} +// impl OptionPair { +// // 它接受一个泛型闭包 F,并返回一个新的 OptionPair +// pub fn map(self, mut f: F) -> Result, E> +// where +// F: FnMut(S, T) -> Result, +// { +// match self { +// OptionPair::Single(s, t) => f(s, t).map(|u| OptionPair::Single(s.clone(), u)), +// OptionPair::Pair(s, t1, t2) => { +// let u1 = f(s, t1)?; +// let u2 = f(s, t2)?; +// Ok(OptionPair::Pair(s, u1, u2)) +// } +// } +// } +// } -impl OptionPair { - pub fn from_slice(s: S, slice: &[T]) -> OptionPair { - match slice { - [a, b] => OptionPair::Pair(s, a.clone(), b.clone()), - [a] => OptionPair::Single(s, a.clone()), - _ => unreachable!(), - } - } -} +// impl OptionPair { +// pub fn from_slice(s: S, slice: &[T]) -> OptionPair { +// match slice { +// [a, b] => OptionPair::Pair(s, a.clone(), b.clone()), +// [a] => OptionPair::Single(s, a.clone()), +// _ => unreachable!(), +// } +// } +// } -impl From<(S, T, Option)> for OptionPair { - fn from(tuple: (S, T, Option)) -> Self { - match tuple { - (s, a, Some(b)) => OptionPair::Pair(s, a, b), - (s, a, None) => OptionPair::Single(s, a), - } - } -} +// impl From<(S, T, Option)> for OptionPair { +// fn from(tuple: (S, T, Option)) -> Self { +// match tuple { +// (s, a, Some(b)) => OptionPair::Pair(s, a, b), +// (s, a, None) => OptionPair::Single(s, a), +// } +// } +// } From 62219dca15045710eca37a22aa3d5f7277750f74 Mon Sep 17 00:00:00 2001 From: dagou 
Date: Wed, 19 Jun 2024 20:46:47 +0800 Subject: [PATCH 10/18] seq kmer --- kr2r/src/bin/classify.rs | 12 +-- kr2r/src/bin/estimate_capacity.rs | 18 ++-- kr2r/src/bin/splitr.rs | 4 +- kr2r/src/classify.rs | 27 +++--- kr2r/src/db.rs | 3 +- seqkmer/src/fasta.rs | 22 +++-- seqkmer/src/fastq.rs | 47 +++++----- seqkmer/src/lib.rs | 1 + seqkmer/src/mmscanner.rs | 43 +++++---- seqkmer/src/parallel.rs | 8 +- seqkmer/src/reader.rs | 20 ++--- seqkmer/src/seq.rs | 141 ++++------------------------- seqkmer/src/utils.rs | 142 +++++++++++++++++++++--------- 13 files changed, 220 insertions(+), 268 deletions(-) diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index 057e0ac..9bbd612 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -6,9 +6,7 @@ use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{create_sample_file, find_and_sort_files, get_lastest_file_index}; use kr2r::IndexOptions; -use seqkmer::{ - create_reader, read_parallel, BaseType, HitGroup, Meros, MinimizerIterator, Reader, SeqHeader, -}; +use seqkmer::{create_reader, read_parallel, Base, HitGroup, Meros, MinimizerIterator, Reader}; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -114,7 +112,7 @@ fn process_seq( } fn process_record( - marker: &mut BaseType, + marker: &mut Base, args: &Args, taxonomy: &Taxonomy, chtable: &CHTable, @@ -122,8 +120,10 @@ fn process_record( cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, ) -> String { - let id = marker.get_s().clone().id; - let hits = marker.apply_mut(|_, m_iter| process_seq(m_iter, &hash_config, chtable)); + let id = &marker.header.id; + let hits = marker + .body + .apply_mut(|m_iter| process_seq(m_iter, &hash_config, chtable)); let total_kmers = hits.total_marker_size(); let seq_len_str = marker.fmt_seq_size(); diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index e046b33..5b182b6 100644 --- 
a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -94,14 +94,16 @@ fn process_sequence( let mut minimizer_set = HashSet::new(); for record in record_set { - record.fold(&mut minimizer_set, |minimizer_set, _, m_iter| { - let kmer_iter: HashSet = m_iter - .filter(|(_, hash_key)| *hash_key & RANGE_MASK < range_n) - .map(|(_, hash_key)| hash_key) - .collect(); - - minimizer_set.extend(kmer_iter); - }); + record + .body + .fold(&mut minimizer_set, |minimizer_set, m_iter| { + let kmer_iter: HashSet = m_iter + .filter(|(_, hash_key)| *hash_key & RANGE_MASK < range_n) + .map(|(_, hash_key)| hash_key) + .collect(); + + minimizer_set.extend(kmer_iter); + }); } Some(minimizer_set) }, diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index e5b6d02..cd8749b 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -156,10 +156,10 @@ where let mut k2_slot_list = Vec::new(); for seq in seqs { let mut init: Vec<(usize, Slot)> = Vec::new(); - let header = seq.get_s(); + let header = &seq.header; let index = header.reads_index; let dna_id = header.id.clone(); - seq.fold(&mut init, |init, _, mut m_iter| { + seq.body.fold(&mut init, |init, mut m_iter| { let seq_id = (file_index << 32 | index) as u64; process_record( init, diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index 2b6e4cd..5dd5597 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -1,7 +1,7 @@ use crate::compact_hash::{Compact, HashConfig, Row}; use crate::readcounts::TaxonCountersDash; use crate::taxonomy::Taxonomy; -use seqkmer::{BaseType, HitGroup}; +use seqkmer::{HitGroup, OptionPair}; use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -158,7 +158,7 @@ pub fn count_values( } fn stat_hits( - hits: &BaseType<(), HitGroup>, + hits: &OptionPair>, cur_taxon_counts: &TaxonCountersDash, counts: &mut HashMap, value_mask: usize, @@ -167,22 +167,26 @@ fn stat_hits( // let mut counts = HashMap::new(); let mut hit_count: usize = 
0; - let hit_str = hits.apply(|_, group| { + let hit_str = hits.apply(|group| { let mut last_pos = 0; let count = group.marker_size as u32; let mut result = Vec::new(); + let mut last_row: Row = Row::new(0, 0, 0); for row in &group.rows { // 统计计数 let value = row.value; let key = value.right(value_mask); *counts.entry(key).or_insert(0) += 1; - cur_taxon_counts - .entry(key as u64) - .or_default() - .add_kmer(value as u64); - hit_count += 1; + if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { + cur_taxon_counts + .entry(key as u64) + .or_default() + .add_kmer(value as u64); + hit_count += 1; + } + last_row = *row; let adjusted_pos = row.kmer_id - group.offset; @@ -224,15 +228,12 @@ fn stat_hits( .join(" ") }); - let hit_string = match hit_str { - BaseType::Single(_, hit) => hit, - BaseType::Pair(_, hit1, hit2) => format!("{} |:| {}", hit1, hit2), - }; + let hit_string = hit_str.reduce_str(" |:| ", |str| str.to_owned()); (hit_count, hit_string) } pub fn process_hitgroup( - hits: &BaseType<(), HitGroup>, + hits: &OptionPair>, hash_config: &HashConfig, taxonomy: &Taxonomy, cur_taxon_counts: &TaxonCountersDash, diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index 6f28bf6..29a6545 100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -265,7 +265,8 @@ pub fn convert_fna_to_k2_format>( let mut k2_cell_list = Vec::new(); for record in seqs { - record.fold(&mut k2_cell_list, |k2_cell_list, header, m_iter| { + let header = &record.header; + record.body.fold(&mut k2_cell_list, |k2_cell_list, m_iter| { if let Some(ext_taxid) = id_to_taxon_map.get(&header.id) { let taxid = taxonomy.get_internal_id(*ext_taxid); let k2_cell: Vec<(usize, Slot)> = m_iter diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs index 350ed08..c2a7c1b 100644 --- a/seqkmer/src/fasta.rs +++ b/seqkmer/src/fasta.rs @@ -1,5 +1,6 @@ -use crate::reader::{dyn_reader, trim_end, Reader, SeqVecType, BUFSIZE}; -use crate::seq::{self, BaseType, SeqFormat, SeqHeader}; +use 
crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; +use crate::seq::{Base, SeqFormat, SeqHeader}; +use crate::utils::OptionPair; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; @@ -36,16 +37,20 @@ where pub fn read_next_entry(&mut self) -> Result, Vec)>> { // 读取fastq文件header部分 - let mut header = Vec::new(); - if self.reader.read_until(b'\n', &mut header)? == 0 { + self.header.clear(); + if self.reader.read_until(b'\n', &mut self.header)? == 0 { return Ok(None); } + let mut header = Vec::with_capacity(self.header.len()); + header.extend_from_slice(&self.header); // 读取fasta文件seq部分 - let mut seq = Vec::new(); - if self.reader.read_until(b'>', &mut seq)? == 0 { + self.seq.clear(); + if self.reader.read_until(b'>', &mut self.seq)? == 0 { return Ok(None); } trim_end(&mut self.seq); + let mut seq = Vec::with_capacity(self.seq.len()); + seq.extend_from_slice(&self.seq); Ok(Some((header, seq))) } @@ -80,7 +85,7 @@ fn check_sequence_length(seq: &Vec) -> bool { } impl Reader for FastaReader { - fn next(&mut self) -> Result> { + fn next(&mut self) -> Result>>>> { // if self.read_next()?.is_none() { // return Ok(None); // } @@ -120,7 +125,6 @@ impl Reader for FastaReader { format: SeqFormat::Fasta, id: seq_id.to_owned(), }; - let seq = BaseType::Single(seq_header, seq); - Ok(Some(vec![seq])) + Ok(Some(vec![Base::new(seq_header, OptionPair::Single(seq))])) } } diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index e0b6c21..4e65337 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -1,10 +1,8 @@ -use crate::reader::{dyn_reader, trim_end, trim_pair_info, Reader, SeqVecType, BUFSIZE}; -use crate::seq::{BaseType, SeqFormat, SeqHeader}; +use crate::reader::{dyn_reader, trim_end, trim_pair_info, Reader, BUFSIZE}; +use crate::seq::{Base, SeqFormat, SeqHeader}; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; -type SeqType = BaseType>; - struct QReader { reader: BufReader, quality_score: i32, @@ -139,7 +137,7 @@ where 
} } - pub fn read_next_entry(&mut self) -> Result> { + pub fn read_next_entry(&mut self) -> Result>>> { let entry = self.inner.read_next_entry()?; if entry.is_none() { return Ok(None); @@ -167,11 +165,11 @@ where id: seq_id.to_owned(), }; - let seq = BaseType::Single(seq_header, seq); + let seq = Base::new(seq_header, OptionPair::Single(seq)); Ok(Some(seq)) } - pub fn read_next(&mut self) -> Result> { + pub fn read_next(&mut self) -> Result>>> { if self.inner.read_next()?.is_none() { return Ok(None); } @@ -196,7 +194,7 @@ where id: seq_id.to_owned(), }; - let seq = BaseType::Single(seq_header, self.inner.seq.to_owned()); + let seq = Base::new(seq_header, OptionPair::Single(self.inner.seq.to_owned())); Ok(Some(seq)) } } @@ -217,8 +215,8 @@ impl Reader for FastqReader where R: Read + Send, { - fn next(&mut self) -> Result> { - let seqs: SeqVecType = (0..self.batch_size) + fn next(&mut self) -> Result>>>> { + let seqs: Vec>> = (0..self.batch_size) .filter_map(|_| self.read_next_entry().transpose()) // 将 Result, _> 转换为 Option> .collect::>>()?; @@ -262,7 +260,7 @@ where } } - pub fn read_next(&mut self) -> Result> { + pub fn read_next(&mut self) -> Result>>> { if self.inner1.read_next()?.is_none() { return Ok(None); } @@ -291,10 +289,9 @@ where id: trim_pair_info(seq_id), }; - let sequence = BaseType::Pair( + let sequence = Base::new( seq_header, - self.inner1.seq.to_owned(), - self.inner2.seq.to_owned(), + OptionPair::Pair(self.inner1.seq.to_owned(), self.inner2.seq.to_owned()), ); Ok(Some(sequence)) } @@ -318,8 +315,8 @@ impl Reader for FastqPairReader where R: Read + Send, { - fn next(&mut self) -> Result> { - let seqs: SeqVecType = (0..self.batch_size) + fn next(&mut self) -> Result>>>> { + let seqs: Vec>> = (0..self.batch_size) .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> .collect::>>()?; @@ -327,7 +324,7 @@ where } } -use crate::seq::OptionPair; +use crate::utils::OptionPair; pub struct FastxPairReader { inner: OptionPair>, @@ -390,7 
+387,7 @@ where } } - pub fn read_next(&mut self) -> Result> { + pub fn read_next(&mut self) -> Result>>> { match &mut self.inner { OptionPair::Single(reader) => { if reader.read_next()?.is_none() { @@ -401,7 +398,10 @@ where let seq_header = Self::create_seq_header(&reader, self.file_index, self.reads_index); - Ok(Some(BaseType::Single(seq_header, reader.seq.to_owned()))) + Ok(Some(Base::new( + seq_header, + OptionPair::Single(reader.seq.to_owned()), + ))) } OptionPair::Pair(reader1, reader2) => { if reader1.read_next()?.is_none() { @@ -415,10 +415,9 @@ where let seq_header = Self::create_seq_header(&reader1, self.file_index, self.reads_index); - Ok(Some(BaseType::Pair( + Ok(Some(Base::new( seq_header, - reader1.seq.to_owned(), - reader2.seq.to_owned(), + OptionPair::Pair(reader1.seq.to_owned(), reader2.seq.to_owned()), ))) } } @@ -441,8 +440,8 @@ impl Reader for FastxPairReader where R: Read + Send, { - fn next(&mut self) -> Result> { - let seqs: SeqVecType = (0..self.batch_size) + fn next(&mut self) -> Result>>>> { + let seqs: Vec>> = (0..self.batch_size) .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> .collect::>>()?; diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs index 9faf476..6332f98 100644 --- a/seqkmer/src/lib.rs +++ b/seqkmer/src/lib.rs @@ -15,3 +15,4 @@ pub use mmscanner::MinimizerIterator; pub use parallel::*; pub use reader::*; pub use seq::*; +pub use utils::OptionPair; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs index 9775f9a..9b79cbf 100644 --- a/seqkmer/src/mmscanner.rs +++ b/seqkmer/src/mmscanner.rs @@ -1,5 +1,6 @@ // kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash -use crate::seq::{BaseType, SeqHeader}; +use crate::seq::Base; +use crate::utils::OptionPair; use crate::{ canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, }; @@ -221,42 +222,38 @@ impl<'a> Iterator for MinimizerIterator<'a> { } } -impl<'a> BaseType> { - pub fn seq_size_str(&self) -> BaseType<(), 
String> { - self.apply(|_, m_iter| m_iter.seq_size().to_string()) +impl<'a> Base> { + pub fn seq_size_str(&self) -> OptionPair { + self.body.apply(|m_iter| m_iter.seq_size().to_string()) } pub fn fmt_seq_size(&self) -> String { - match &self { - BaseType::Single(_, m_iter) => m_iter.seq_size().to_string(), - BaseType::Pair(_, m_iter1, m_iter2) => { - format!("{}|{}", m_iter1.seq_size(), m_iter2.seq_size()) - } - } + self.body + .reduce_str("|", |m_iter| m_iter.seq_size().to_string()) } pub fn fmt_size(&self) -> String { - match &self { - BaseType::Single(_, m_iter) => m_iter.size.to_string(), - BaseType::Pair(_, m_iter1, m_iter2) => { - format!("{}|{}", m_iter1.size, m_iter2.size) - } - } + self.body.reduce_str("|", |m_iter| m_iter.size.to_string()) } } + pub fn scan_sequence<'a>( - sequence: &'a BaseType>, + sequence: &'a Base>, meros: &'a Meros, -) -> BaseType> { +) -> Base> { let func = |seq: &'a Vec| { let cursor = Cursor::new(meros.l_mer, meros.mask); let window = MinimizerWindow::new(meros.window_size()); - MinimizerIterator::new(&seq, cursor, window, meros) + MinimizerIterator::new(seq, cursor, window, meros) }; - match sequence { - BaseType::Single(header, seq) => BaseType::Single(header.clone(), func(seq)), - BaseType::Pair(header, seq1, seq2) => { - BaseType::Pair(header.clone(), func(seq1), func(seq2)) + + match &sequence.body { + OptionPair::Pair(seq1, seq2) => Base::new( + sequence.header.clone(), + OptionPair::Pair(func(&seq1), func(&seq2)), + ), + OptionPair::Single(seq1) => { + Base::new(sequence.header.clone(), OptionPair::Single(func(&seq1))) } } } diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index ac829e3..47a4ada 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -2,7 +2,7 @@ use crate::fasta::FastaReader; use crate::fastq::{FastqPairReader, FastqReader}; use crate::mmscanner::{scan_sequence, MinimizerIterator}; use crate::reader::{detect_file_format, Reader}; -use crate::seq::{BaseType, SeqHeader}; +use 
crate::seq::Base; use crate::{Meros, SeqFormat}; use crossbeam_channel::{bounded, Receiver}; use scoped_threadpool::Pool; @@ -60,12 +60,12 @@ where R: Reader, O: Send, Out: Send + Default, - W: Send + Sync + Fn(&mut Vec>) -> Option, + W: Send + Sync + Fn(&mut Vec>) -> Option, F: FnOnce(&mut ParallelResult>) -> Out + Send, { assert!(n_threads > 2); assert!(n_threads <= buffer_len); - let (sender, receiver) = bounded::>>>(buffer_len); + let (sender, receiver) = bounded::>>>(buffer_len); let (done_send, done_recv) = bounded::>(buffer_len); let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver let done_send = Arc::new(done_send); @@ -88,7 +88,7 @@ where let done_send = Arc::clone(&done_send); pool_scope.execute(move || { while let Ok(mut seqs) = receiver.recv() { - let mut markers: Vec>> = seqs + let mut markers: Vec>> = seqs .iter_mut() .map(|seq| scan_sequence(seq, &meros)) .collect(); diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index 3f274f7..7a7f202 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -1,4 +1,5 @@ -use crate::seq::{BaseType, SeqFormat, SeqHeader}; +use crate::seq::{Base, SeqFormat}; +use crate::utils::OptionPair; use flate2::read::GzDecoder; use std::fs::File; use std::io::{self, BufRead, BufReader, Read, Result, Seek}; @@ -88,14 +89,14 @@ pub(crate) fn trim_end(buffer: &mut Vec) { pub const BUFSIZE: usize = 16 * 1024 * 1024; -pub type SeqVecType = Vec>>; +// pub type SeqVecType = Vec; pub trait Reader: Send { - fn next(&mut self) -> Result>; + fn next(&mut self) -> Result>>>>; } impl Reader for Box { - fn next(&mut self) -> Result> { + fn next(&mut self) -> Result>>>> { (**self).next() } } @@ -120,7 +121,7 @@ impl HitGroup { } } -impl BaseType> { +impl OptionPair> { /// Synchronizes the offset of the second element of a `Pair` to the `cap` of the first element. /// This alignment is only necessary when the `rows` property of the `HitGroup` is in an /// increasing order. 
If `rows` is not increasing, aligning the offset based on `cap` may not @@ -132,19 +133,16 @@ impl BaseType> { /// let mut hit_group1 = HitGroup::new(10, vec![1, 2, 3], 0); // Increasing `rows` /// let mut hit_group2 = HitGroup::new(20, vec![4, 5, 6], 0); /// - /// let mut pair = BaseType::Pair((hit_group1, hit_group2)); + /// let mut pair = OptionPair::Pair((hit_group1, hit_group2)); /// pair.align_offset(); /// ``` pub fn align_offset(&mut self) { - if let BaseType::Pair(_, ref first, ref mut second) = self { + if let OptionPair::Pair(ref first, ref mut second) = self { second.offset = first.marker_size as u32; } } pub fn total_marker_size(&self) -> usize { - match &self { - BaseType::Single(_, hit) => hit.marker_size, - BaseType::Pair(_, hit1, hit2) => hit1.marker_size + hit2.marker_size, - } + self.reduce(0, |acc, hit| acc + hit.marker_size) } } diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs index f78cf57..aacfbaf 100644 --- a/seqkmer/src/seq.rs +++ b/seqkmer/src/seq.rs @@ -1,3 +1,5 @@ +use crate::utils::OptionPair; + #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum SeqFormat { Fasta, @@ -12,137 +14,24 @@ pub struct SeqHeader { pub format: SeqFormat, } -#[derive(Debug, Clone)] -pub enum OptionPair { - Single(T), - Pair(T, T), +#[derive(Debug)] +pub struct Base { + pub header: SeqHeader, + pub body: OptionPair, } -impl OptionPair { - // 它接受一个泛型闭包 F,并返回一个新的 OptionPair - pub fn map(self, mut f: F) -> Result, E> - where - F: FnMut(T) -> Result, - { - match self { - OptionPair::Single(t) => f(t).map(OptionPair::Single), - OptionPair::Pair(t1, t2) => { - let u1 = f(t1)?; - let u2 = f(t2)?; - Ok(OptionPair::Pair(u1, u2)) - } - } +impl Base { + pub fn new(header: SeqHeader, body: OptionPair) -> Self { + Self { header, body } } -} -impl OptionPair { - pub fn from_slice(slice: &[T]) -> OptionPair { - match slice { - [a, b] => OptionPair::Pair(a.clone(), b.clone()), - [a] => OptionPair::Single(a.clone()), - _ => unreachable!(), - } - } -} - -impl From<(T, 
Option)> for OptionPair { - fn from(tuple: (T, Option)) -> Self { - match tuple { - (a, Some(b)) => OptionPair::Pair(a, b), - (a, None) => OptionPair::Single(a), - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum BaseType { - Single(S, T), - Pair(S, T, T), -} - -impl BaseType { - // 泛型方法,根据序列类型执行操作 - pub fn apply<'a, U, F>(&'a self, mut func: F) -> BaseType<(), U> - where - F: FnMut(&'a S, &'a T) -> U, - { - match self { - BaseType::Single(prop, seq) => BaseType::Single((), func(prop, seq)), - BaseType::Pair(prop, seq1, seq2) => { - BaseType::Pair((), func(prop, seq1), func(prop, seq2)) - } - } - } - - pub fn apply_mut<'a, U, F>(&'a mut self, mut func: F) -> BaseType<(), U> + pub fn map(&self, mut f: F) -> Result, E> where - F: FnMut(&'a S, &'a mut T) -> U, + F: FnMut(&T) -> Result, { - match self { - BaseType::Single(prop, seq) => BaseType::Single((), func(prop, seq)), - BaseType::Pair(prop, seq1, seq2) => { - BaseType::Pair((), func(prop, seq1), func(prop, seq2)) - } - } - } - - pub fn get_s(&self) -> &S { - match self { - BaseType::Single(prop, _) => prop, - BaseType::Pair(prop, _, _) => prop, - } - } - - pub fn transform<'a, U, F, V>(&mut self, mut func: F) -> BaseType - where - F: for<'b> FnMut(&S, &'b mut T) -> (V, U), - { - match self { - BaseType::Single(prop, seq) => { - let res1 = func(prop, seq); - BaseType::Single(res1.0, res1.1) - } - BaseType::Pair(prop, seq1, seq2) => { - let res1 = func(prop, seq1); - let res2 = func(prop, seq2); - BaseType::Pair(res1.0, res1.1, res2.1) - } - } - } - - pub fn fold(&mut self, init: &mut V, mut func: F) -> BaseType<(), U> - where - F: FnMut(&mut V, &S, &mut T) -> U, - { - match self { - BaseType::Single(prop, seq) => BaseType::Single((), func(init, prop, seq)), - BaseType::Pair(prop, ref mut seq1, ref mut seq2) => { - let res1 = func(init, prop, seq1); - let res2 = func(init, prop, seq2); - BaseType::Pair((), res1, res2) - } - } - } - - pub fn modify(&mut self, mut func: F) - where - F: FnMut(&S, 
&mut T), - { - match self { - BaseType::Single(prop, ref mut seq) => func(prop, seq), - BaseType::Pair(prop, ref mut seq1, ref mut seq2) => { - func(prop, seq1); - func(prop, seq2); - } - } - } -} - -impl BaseType> -where - S: Copy, -{ - pub fn len(&self) -> BaseType<(), usize> { - self.apply(|_, seq| seq.len()) + self.body.map(|t| f(&t)).map(|body| Base { + header: self.header.clone(), + body, + }) } } diff --git a/seqkmer/src/utils.rs b/seqkmer/src/utils.rs index eae4dd9..56f5b6c 100644 --- a/seqkmer/src/utils.rs +++ b/seqkmer/src/utils.rs @@ -1,41 +1,101 @@ -// #[derive(Debug, Clone)] -// pub enum OptionPair { -// Single(S, T), -// Pair(S, T, T), -// } - -// impl OptionPair { -// // 它接受一个泛型闭包 F,并返回一个新的 OptionPair -// pub fn map(self, mut f: F) -> Result, E> -// where -// F: FnMut(S, T) -> Result, -// { -// match self { -// OptionPair::Single(s, t) => f(s, t).map(|u| OptionPair::Single(s.clone(), u)), -// OptionPair::Pair(s, t1, t2) => { -// let u1 = f(s, t1)?; -// let u2 = f(s, t2)?; -// Ok(OptionPair::Pair(s, u1, u2)) -// } -// } -// } -// } - -// impl OptionPair { -// pub fn from_slice(s: S, slice: &[T]) -> OptionPair { -// match slice { -// [a, b] => OptionPair::Pair(s, a.clone(), b.clone()), -// [a] => OptionPair::Single(s, a.clone()), -// _ => unreachable!(), -// } -// } -// } - -// impl From<(S, T, Option)> for OptionPair { -// fn from(tuple: (S, T, Option)) -> Self { -// match tuple { -// (s, a, Some(b)) => OptionPair::Pair(s, a, b), -// (s, a, None) => OptionPair::Single(s, a), -// } -// } -// } +#[derive(Debug, Clone)] +pub enum OptionPair { + Single(T), + Pair(T, T), +} + +impl OptionPair { + // 它接受一个泛型闭包 F,并返回一个新的 OptionPair + pub fn map(&self, mut f: F) -> Result, E> + where + F: FnMut(&T) -> Result, + { + match self { + OptionPair::Single(t) => f(t).map(OptionPair::Single), + OptionPair::Pair(t1, t2) => { + let u1 = f(t1)?; + let u2 = f(t2)?; + Ok(OptionPair::Pair(u1, u2)) + } + } + } + + pub fn reduce(&self, init: U, mut f: F) -> U + where + F: 
FnMut(U, &T) -> U, + { + match self { + OptionPair::Single(t) => f(init, t), + OptionPair::Pair(t1, t2) => { + let result = f(init, t1); + f(result, t2) + } + } + } + + pub fn fold(&mut self, init: &mut V, mut func: F) -> OptionPair + where + F: FnMut(&mut V, &mut T) -> U, + { + match self { + OptionPair::Single(seq) => OptionPair::Single(func(init, seq)), + OptionPair::Pair(ref mut seq1, ref mut seq2) => { + let res1 = func(init, seq1); + let res2 = func(init, seq2); + OptionPair::Pair(res1, res2) + } + } + } + + pub fn reduce_str(&self, sep: &str, mut f: F) -> String + where + F: FnMut(&T) -> String, + { + self.reduce(String::new(), |acc, t| { + if acc.is_empty() { + f(t) + } else { + format!("{}{}{}", acc, sep, f(t)) + } + }) + } + + pub fn apply(&self, mut f: F) -> OptionPair + where + F: FnMut(&T) -> U, + { + match self { + OptionPair::Single(t) => OptionPair::Single(f(t)), + OptionPair::Pair(t1, t2) => OptionPair::Pair(f(t1), f(t2)), + } + } + + pub fn apply_mut(&mut self, mut f: F) -> OptionPair + where + F: FnMut(&mut T) -> U, + { + match self { + OptionPair::Single(t) => OptionPair::Single(f(t)), + OptionPair::Pair(t1, t2) => OptionPair::Pair(f(t1), f(t2)), + } + } +} + +impl OptionPair { + pub fn from_slice(slice: &[T]) -> OptionPair { + match slice { + [a, b] => OptionPair::Pair(a.clone(), b.clone()), + [a] => OptionPair::Single(a.clone()), + _ => unreachable!(), + } + } +} + +impl From<(T, Option)> for OptionPair { + fn from(tuple: (T, Option)) -> Self { + match tuple { + (a, Some(b)) => OptionPair::Pair(a, b), + (a, None) => OptionPair::Single(a), + } + } +} From 5ccdfd02c84c97065946259e72e7549a7f42a539 Mon Sep 17 00:00:00 2001 From: dagou Date: Thu, 20 Jun 2024 10:01:32 +0800 Subject: [PATCH 11/18] seq kmer --- kr2r/src/classify.rs | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index 5dd5597..be01a43 100644 --- a/kr2r/src/classify.rs +++ 
b/kr2r/src/classify.rs @@ -159,34 +159,40 @@ pub fn count_values( fn stat_hits( hits: &OptionPair>, - cur_taxon_counts: &TaxonCountersDash, counts: &mut HashMap, value_mask: usize, taxonomy: &Taxonomy, -) -> (usize, String) { +) -> (usize, TaxonCountersDash, String) { // let mut counts = HashMap::new(); let mut hit_count: usize = 0; + let cur_taxon_counts = TaxonCountersDash::new(); let hit_str = hits.apply(|group| { let mut last_pos = 0; let count = group.marker_size as u32; let mut result = Vec::new(); - let mut last_row: Row = Row::new(0, 0, 0); + // let mut last_row: Row = Row::new(0, 0, 0); for row in &group.rows { // 统计计数 let value = row.value; let key = value.right(value_mask); *counts.entry(key).or_insert(0) += 1; - if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { - cur_taxon_counts - .entry(key as u64) - .or_default() - .add_kmer(value as u64); - hit_count += 1; - } - last_row = *row; + // if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { + // cur_taxon_counts + // .entry(key as u64) + // .or_default() + // .add_kmer(value as u64); + // hit_count += 1; + // } + + cur_taxon_counts + .entry(key as u64) + .or_default() + .add_kmer(value as u64); + hit_count += 1; + // last_row = *row; let adjusted_pos = row.kmer_id - group.offset; @@ -229,7 +235,7 @@ fn stat_hits( }); let hit_string = hit_str.reduce_str(" |:| ", |str| str.to_owned()); - (hit_count, hit_string) + (hit_count, cur_taxon_counts, hit_string) } pub fn process_hitgroup( @@ -245,9 +251,15 @@ pub fn process_hitgroup( let value_mask = hash_config.value_mask; let mut counts = HashMap::new(); - let (hit_groups, hit_string) = - stat_hits(hits, cur_taxon_counts, &mut counts, value_mask, taxonomy); + let (hit_groups, cur_counts, hit_string) = stat_hits(hits, &mut counts, value_mask, taxonomy); + cur_counts.iter().for_each(|entry| { + cur_taxon_counts + .entry(*entry.key()) + .or_default() + .merge(entry.value()) + .unwrap(); + }); let mut call = 
resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); if call > 0 && hit_groups < minimum_hit_groups { call = 0; From e6dfa8925b5aef5712e1d8732e1209f6cece9353 Mon Sep 17 00:00:00 2001 From: dagou Date: Fri, 21 Jun 2024 17:45:50 +0800 Subject: [PATCH 12/18] seq kmer --- kr2r/src/bin/classify.rs | 45 ++-- kr2r/src/bin/estimate_capacity.rs | 18 +- kr2r/src/bin/resolve.rs | 81 ++++--- kr2r/src/bin/splitr.rs | 14 +- kr2r/src/classify.rs | 349 +++++++++++++----------------- kr2r/src/db.rs | 2 +- kr2r/src/kr2r_data.rs | 22 ++ seqkmer/src/fasta.rs | 101 +++++---- seqkmer/src/fastq.rs | 266 +---------------------- seqkmer/src/fastx.rs | 45 ++++ seqkmer/src/lib.rs | 2 + seqkmer/src/mmscanner.rs | 28 +++ seqkmer/src/parallel.rs | 28 +-- seqkmer/src/reader.rs | 138 ++++++++---- seqkmer/src/utils.rs | 33 +-- 15 files changed, 530 insertions(+), 642 deletions(-) create mode 100644 seqkmer/src/fastx.rs diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index 9bbd612..bc123c2 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -5,8 +5,8 @@ use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{create_sample_file, find_and_sort_files, get_lastest_file_index}; -use kr2r::IndexOptions; -use seqkmer::{create_reader, read_parallel, Base, HitGroup, Meros, MinimizerIterator, Reader}; +use kr2r::{HitGroup, IndexOptions}; +use seqkmer::{read_parallel, Base, FastxReader, Meros, MinimizerIterator, OptionPair, Reader}; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -86,14 +86,14 @@ pub struct Args { } fn process_seq( + rows: &mut Vec, m_iter: &mut MinimizerIterator, hash_config: &HashConfig, chtable: &CHTable, -) -> HitGroup { + offset: usize, +) -> usize { let chunk_size = hash_config.hash_capacity; let value_bits = hash_config.value_bits; - let mut rows = Vec::new(); - let data: Vec<(usize, u64)> = 
m_iter.collect(); for (sort, hash_key) in data { let (idx, compacted) = hash_config.compact(hash_key); @@ -103,12 +103,11 @@ fn process_seq( let taxid = chtable.get_from_page(index, compacted, partition_index + 1); if taxid > 0 { let high = u32::combined(compacted, taxid, value_bits); - let row = Row::new(high, 0, sort as u32 + 1); + let row = Row::new(high, 0, sort as u32 + 1 + offset as u32); rows.push(row); } } - - HitGroup::new(m_iter.size, rows, 0) + m_iter.size + offset } fn process_record( @@ -120,23 +119,31 @@ fn process_record( cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, ) -> String { - let id = &marker.header.id; - let hits = marker - .body - .apply_mut(|m_iter| process_seq(m_iter, &hash_config, chtable)); - let total_kmers = hits.total_marker_size(); + let id = &marker.header.id.clone(); + let rows: Vec = marker + .fold(|rows, m_iter, offset| process_seq(rows, m_iter, &hash_config, chtable, offset)); + + let hits = HitGroup::new(rows, marker.range()); + let seq_len_str = marker.fmt_seq_size(); + let required_score = hits.required_score(args.confidence_threshold); let hit_data = process_hitgroup( &hits, - hash_config, taxonomy, - cur_taxon_counts, classify_counter, - total_kmers, - args.confidence_threshold, + required_score, args.minimum_hit_groups, + hash_config.value_mask, ); + + hit_data.3.iter().for_each(|(key, value)| { + cur_taxon_counts + .entry(*key) + .or_default() + .merge(value) + .unwrap(); + }); format!( "{}\t{}\t{}\t{}\t{}\n", hit_data.0, id, hit_data.1, seq_len_str, hit_data.2 @@ -269,7 +276,9 @@ fn process_files( file_writer.flush().unwrap(); let score = args.minimum_quality_score; - let mut reader: Box = create_reader(file_pair, file_index, score)?; + let paths = OptionPair::from_slice(file_pair); + let mut reader = FastxReader::from_paths(paths, file_index, score)?; + // let mut reader = create_reader(file_pair, file_index, score)?; let (thread_sequences, thread_unclassified) = process_fastx_file( &args, meros, 
diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index 5b182b6..8f8d39e 100644 --- a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -94,16 +94,14 @@ fn process_sequence( let mut minimizer_set = HashSet::new(); for record in record_set { - record - .body - .fold(&mut minimizer_set, |minimizer_set, m_iter| { - let kmer_iter: HashSet = m_iter - .filter(|(_, hash_key)| *hash_key & RANGE_MASK < range_n) - .map(|(_, hash_key)| hash_key) - .collect(); - - minimizer_set.extend(kmer_iter); - }); + record.body.apply_mut(|m_iter| { + let kmer_iter: HashSet = m_iter + .filter(|(_, hash_key)| *hash_key & RANGE_MASK < range_n) + .map(|(_, hash_key)| hash_key) + .collect(); + + minimizer_set.extend(kmer_iter); + }); } Some(minimizer_set) }, diff --git a/kr2r/src/bin/resolve.rs b/kr2r/src/bin/resolve.rs index 5fd79d6..8c3c96b 100644 --- a/kr2r/src/bin/resolve.rs +++ b/kr2r/src/bin/resolve.rs @@ -1,13 +1,14 @@ use clap::Parser; use dashmap::{DashMap, DashSet}; -use kr2r::classify::{add_hitlist_string, count_values, resolve_tree}; +use kr2r::classify::process_hitgroup; use kr2r::compact_hash::{HashConfig, Row}; use kr2r::readcounts::{TaxonCounters, TaxonCountersDash}; use kr2r::report::report_kraken_style; use kr2r::taxonomy::Taxonomy; use kr2r::utils::{find_and_sort_files, open_file}; +use kr2r::HitGroup; use rayon::prelude::*; -use seqkmer::trim_pair_info; +use seqkmer::{trim_pair_info, OptionPair}; use std::collections::HashMap; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}; @@ -20,7 +21,7 @@ const BATCH_SIZE: usize = 8 * 1024 * 1024; pub fn read_id_to_seq_map>( filename: P, -) -> Result)>> { +) -> Result)>> { let file = open_file(filename)?; let reader = BufReader::new(file); let id_map = DashMap::new(); @@ -35,9 +36,9 @@ pub fn read_id_to_seq_map>( let seq_id = parts[1].to_string(); let seq_size = parts[2].to_string(); let count_parts: Vec<&str> = 
parts[3].split('|').collect(); - let kmer_count1 = count_parts[0].parse::().unwrap(); + let kmer_count1 = count_parts[0].parse::().unwrap(); let kmer_count2 = if count_parts.len() > 1 { - count_parts[1].parse::().map_or(None, |i| Some(i)) + count_parts[1].parse::().map_or(None, |i| Some(i)) } else { None }; @@ -106,7 +107,7 @@ fn process_batch>( sample_file: P, args: &Args, taxonomy: &Taxonomy, - id_map: &DashMap)>, + id_map: &DashMap)>, writer: &Mutex>, value_mask: usize, ) -> Result<(TaxonCountersDash, usize, DashSet)> { @@ -148,39 +149,50 @@ fn process_batch>( hit_counts.into_par_iter().for_each(|(k, mut rows)| { if let Some(item) = id_map.get(&k) { rows.sort_unstable(); - let total_kmers: usize = item.2 as usize + item.3.unwrap_or(0) as usize; let dna_id = trim_pair_info(&item.0); - let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, item.2); - let hit_string = add_hitlist_string(&rows, value_mask, item.2, item.3, taxonomy); - let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); - if call > 0 && hit_groups < minimum_hit_groups { - call = 0; - }; - - cur_counts.iter().for_each(|entry| { + let range = OptionPair::from(((0, item.2), item.3.map(|size| (item.2, size + item.2)))); + let hits = HitGroup::new(rows, range); + + let hit_data = process_hitgroup( + &hits, + taxonomy, + &classify_counter, + hits.required_score(confidence_threshold), + minimum_hit_groups, + value_mask, + ); + // let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, item.2); + // let hit_string = add_hitlist_string(&rows, value_mask, item.2, item.3, taxonomy); + // let require_score = (confidence_threshold * total_kmers as f64).ceil() as u64; + // let mut call = resolve_tree(&counts, taxonomy, require_score); + // if call > 0 && hit_groups < minimum_hit_groups { + // call = 0; + // }; + + hit_data.3.iter().for_each(|(key, value)| { cur_taxon_counts - .entry(*entry.key()) + .entry(*key) .or_default() - .merge(entry.value()) + 
.merge(value) .unwrap(); }); - let ext_call = taxonomy.nodes[call as usize].external_id; - let clasify = if call > 0 { - classify_counter.fetch_add(1, Ordering::SeqCst); - cur_taxon_counts - .entry(call as u64) - .or_default() - .increment_read_count(); - - "C" - } else { - "U" - }; + // let ext_call = taxonomy.nodes[call as usize].external_id; + // let clasify = if call > 0 { + // classify_counter.fetch_add(1, Ordering::SeqCst); + // cur_taxon_counts + // .entry(call as u64) + // .or_default() + // .increment_read_count(); + + // "C" + // } else { + // "U" + // }; // 使用锁来同步写入 let output_line = format!( "{}\t{}\t{}\t{}\t{}\n", - clasify, dna_id, ext_call, item.1, hit_string + hit_data.0, dna_id, hit_data.1, item.1, hit_data.2 ); let mut file = writer.lock().unwrap(); file.write_all(output_line.as_bytes()).unwrap(); @@ -241,8 +253,13 @@ pub fn run(args: Args) -> Result<()> { .filter(|item| !hit_seq_set.contains(item.key())) .for_each(|item| { let dna_id = trim_pair_info(&item.0); - let hit_string = add_hitlist_string(&vec![], value_mask, item.2, item.3, &taxo); - let output_line = format!("U\t{}\t0\t{}\t{}\n", dna_id, item.1, hit_string); + let output_line = format!( + "U\t{}\t0\t{}\t{}\n", + dna_id, + item.1, + if item.3.is_none() { "" } else { " |:| " } + ); + let mut file = writer.lock().unwrap(); file.write_all(output_line.as_bytes()).unwrap(); }); diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index cd8749b..41f37dc 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -5,7 +5,7 @@ use kr2r::utils::{ get_lastest_file_index, }; use kr2r::IndexOptions; -use seqkmer::{create_reader, read_parallel, Meros, MinimizerIterator, Reader}; +use seqkmer::{read_parallel, FastxReader, Meros, MinimizerIterator, OptionPair, Reader}; use std::fs; use std::io::{BufWriter, Write}; use std::io::{Error, ErrorKind, Result}; @@ -159,11 +159,12 @@ where let header = &seq.header; let index = header.reads_index; let dna_id = header.id.clone(); - 
seq.body.fold(&mut init, |init, mut m_iter| { - let seq_id = (file_index << 32 | index) as u64; + let seq_id = (file_index << 32 | index) as u64; + + seq.body.apply_mut(|m_iter| { process_record( - init, - &mut m_iter, + &mut init, + m_iter, &hash_config, chunk_size, seq_id, @@ -221,7 +222,8 @@ fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { create_sample_file(args.chunk_dir.join(format!("sample_id_{}.map", file_index))); let score = args.minimum_quality_score; - let mut reader: Box = create_reader(file_pair, file_index, score)?; + let paths = OptionPair::from_slice(file_pair); + let mut reader = FastxReader::from_paths(paths, file_index, score)?; process_fastx_file( &args, meros, diff --git a/kr2r/src/classify.rs b/kr2r/src/classify.rs index be01a43..71229bb 100644 --- a/kr2r/src/classify.rs +++ b/kr2r/src/classify.rs @@ -1,73 +1,72 @@ -use crate::compact_hash::{Compact, HashConfig, Row}; -use crate::readcounts::TaxonCountersDash; +use crate::compact_hash::Compact; +use crate::readcounts::TaxonCounters; use crate::taxonomy::Taxonomy; -use seqkmer::{HitGroup, OptionPair}; +use crate::HitGroup; +use seqkmer::SpaceDist; use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; -fn generate_hit_string( - count: u32, - rows: &Vec, - taxonomy: &Taxonomy, - value_mask: usize, - offset: u32, -) -> String { - let mut result = Vec::new(); - let mut last_pos = 0; - - for row in rows { - if row.kmer_id < offset || row.kmer_id >= offset + count { - continue; - } - let adjusted_pos = row.kmer_id - offset; - - let value = row.value; - let key = value.right(value_mask); - let ext_code = taxonomy.nodes[key as usize].external_id; - - if last_pos == 0 && adjusted_pos > 0 { - result.push((0, adjusted_pos)); // 在开始处添加0 - } else if adjusted_pos - last_pos > 1 { - result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 - } - if let Some(last) = result.last_mut() { - if last.0 == ext_code { - last.1 += 1; - last_pos = 
adjusted_pos; - continue; - } - } - - // 添加当前key的计数 - result.push((ext_code, 1)); - last_pos = adjusted_pos; - } - - // 填充尾随0 - if last_pos < count - 1 { - if last_pos == 0 { - result.push((0, count - last_pos)); - } else { - result.push((0, count - last_pos - 1)); - } - } - - result - .iter() - .map(|i| format!("{}:{}", i.0, i.1)) - .collect::>() - .join(" ") -} +// fn generate_hit_string( +// count: usize, +// rows: &Vec, +// taxonomy: &Taxonomy, +// value_mask: usize, +// offset: usize, +// ) -> String { +// let mut result = Vec::new(); +// let mut last_pos = 0; + +// for row in rows { +// let sort = row.kmer_id as usize; +// if sort < offset || sort >= offset + count { +// continue; +// } +// let adjusted_pos = row.kmer_id as usize - offset; + +// let value = row.value; +// let key = value.right(value_mask); +// let ext_code = taxonomy.nodes[key as usize].external_id; + +// if last_pos == 0 && adjusted_pos > 0 { +// result.push((0, adjusted_pos)); // 在开始处添加0 +// } else if adjusted_pos - last_pos > 1 { +// result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 +// } +// if let Some(last) = result.last_mut() { +// if last.0 == ext_code { +// last.1 += 1; +// last_pos = adjusted_pos; +// continue; +// } +// } + +// // 添加当前key的计数 +// result.push((ext_code, 1)); +// last_pos = adjusted_pos; +// } + +// // 填充尾随0 +// if last_pos < count - 1 { +// if last_pos == 0 { +// result.push((0, count - last_pos)); +// } else { +// result.push((0, count - last_pos - 1)); +// } +// } + +// result +// .iter() +// .map(|i| format!("{}:{}", i.0, i.1)) +// .collect::>() +// .join(" ") +// } // &HashMap, pub fn resolve_tree( hit_counts: &HashMap, taxonomy: &Taxonomy, - total_minimizers: usize, - confidence_threshold: f64, + required_score: u64, ) -> u32 { - let required_score = (confidence_threshold * total_minimizers as f64).ceil() as u64; - let mut max_taxon = 0u32; let mut max_score = 0; @@ -106,161 +105,115 @@ pub fn resolve_tree( max_taxon } -pub fn add_hitlist_string( - 
rows: &Vec, +// pub fn add_hitlist_string( +// rows: &Vec, +// value_mask: usize, +// kmer_count1: usize, +// kmer_count2: Option, +// taxonomy: &Taxonomy, +// ) -> String { +// let result1 = generate_hit_string(kmer_count1, &rows, taxonomy, value_mask, 0); +// if let Some(count) = kmer_count2 { +// let result2 = generate_hit_string(count, &rows, taxonomy, value_mask, kmer_count1); +// format!("{} |:| {}", result1, result2) +// } else { +// format!("{}", result1) +// } +// } + +// pub fn count_values( +// rows: &Vec, +// value_mask: usize, +// kmer_count1: u32, +// ) -> (HashMap, TaxonCountersDash, usize) { +// let mut counts = HashMap::new(); + +// let mut hit_count: usize = 0; + +// let mut last_row: Row = Row::new(0, 0, 0); +// let cur_taxon_counts = TaxonCountersDash::new(); + +// for row in rows { +// let value = row.value; +// let key = value.right(value_mask); +// *counts.entry(key).or_insert(0) += 1; + +// // 如果切换到第2条seq,就重新计算 +// if last_row.kmer_id < kmer_count1 && row.kmer_id > kmer_count1 { +// last_row = Row::new(0, 0, 0); +// } +// if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { +// cur_taxon_counts +// .entry(key as u64) +// .or_default() +// .add_kmer(value as u64); +// hit_count += 1; +// } + +// last_row = *row; +// } + +// (counts, cur_taxon_counts, hit_count) +// } + +fn stat_hits<'a>( + hits: &HitGroup, + counts: &mut HashMap, value_mask: usize, - kmer_count1: u32, - kmer_count2: Option, taxonomy: &Taxonomy, + cur_taxon_counts: &mut TaxonCounters, ) -> String { - let result1 = generate_hit_string(kmer_count1, &rows, taxonomy, value_mask, 0); - if let Some(count) = kmer_count2 { - let result2 = generate_hit_string(count, &rows, taxonomy, value_mask, kmer_count1); - format!("{} |:| {}", result1, result2) - } else { - format!("{}", result1) - } -} - -pub fn count_values( - rows: &Vec, - value_mask: usize, - kmer_count1: u32, -) -> (HashMap, TaxonCountersDash, usize) { - let mut counts = HashMap::new(); - - let mut hit_count: 
usize = 0; - - let mut last_row: Row = Row::new(0, 0, 0); - let cur_taxon_counts = TaxonCountersDash::new(); - - for row in rows { + let mut space_dist = hits.range.apply(|range| SpaceDist::new(*range)); + for row in &hits.rows { let value = row.value; let key = value.right(value_mask); + *counts.entry(key).or_insert(0) += 1; - // 如果切换到第2条seq,就重新计算 - if last_row.kmer_id < kmer_count1 && row.kmer_id > kmer_count1 { - last_row = Row::new(0, 0, 0); - } - if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { - cur_taxon_counts - .entry(key as u64) - .or_default() - .add_kmer(value as u64); - hit_count += 1; - } + cur_taxon_counts + .entry(key as u64) + .or_default() + .add_kmer(value as u64); - last_row = *row; + let ext_code = taxonomy.nodes[key as usize].external_id; + let pos = row.kmer_id as usize; + space_dist.add(ext_code, pos); } - (counts, cur_taxon_counts, hit_count) -} - -fn stat_hits( - hits: &OptionPair>, - counts: &mut HashMap, - value_mask: usize, - taxonomy: &Taxonomy, -) -> (usize, TaxonCountersDash, String) { - // let mut counts = HashMap::new(); - let mut hit_count: usize = 0; - - let cur_taxon_counts = TaxonCountersDash::new(); - let hit_str = hits.apply(|group| { - let mut last_pos = 0; - let count = group.marker_size as u32; - let mut result = Vec::new(); - - // let mut last_row: Row = Row::new(0, 0, 0); - for row in &group.rows { - // 统计计数 - let value = row.value; - let key = value.right(value_mask); - *counts.entry(key).or_insert(0) += 1; - - // if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) { - // cur_taxon_counts - // .entry(key as u64) - // .or_default() - // .add_kmer(value as u64); - // hit_count += 1; - // } - - cur_taxon_counts - .entry(key as u64) - .or_default() - .add_kmer(value as u64); - hit_count += 1; - // last_row = *row; - - let adjusted_pos = row.kmer_id - group.offset; - - let value = row.value; - let key = value.right(value_mask); - let ext_code = taxonomy.nodes[key as usize].external_id; 
- - if last_pos == 0 && adjusted_pos > 0 { - result.push((0, adjusted_pos)); // 在开始处添加0 - } else if adjusted_pos - last_pos > 1 { - result.push((0, adjusted_pos - last_pos - 1)); // 在两个特定位置之间添加0 - } - if let Some(last) = result.last_mut() { - if last.0 == ext_code { - last.1 += 1; - last_pos = adjusted_pos; - continue; - } - } - - // 添加当前key的计数 - result.push((ext_code, 1)); - last_pos = adjusted_pos; - } - - // 填充尾随0 - if last_pos < count - 1 { - if last_pos == 0 { - result.push((0, count - last_pos)); - } else { - result.push((0, count - last_pos - 1)); - } - } - - result - .iter() - .map(|i| format!("{}:{}", i.0, i.1)) - .collect::>() - .join(" ") - }); - - let hit_string = hit_str.reduce_str(" |:| ", |str| str.to_owned()); - (hit_count, cur_taxon_counts, hit_string) + space_dist.fill_tail_with_zeros(); + space_dist.reduce_str(" |:| ", |str| str.to_string()) } pub fn process_hitgroup( - hits: &OptionPair>, - hash_config: &HashConfig, + hits: &HitGroup, taxonomy: &Taxonomy, - cur_taxon_counts: &TaxonCountersDash, classify_counter: &AtomicUsize, - total_kmers: usize, - confidence_threshold: f64, + required_score: u64, minimum_hit_groups: usize, -) -> (String, u64, String) { - let value_mask = hash_config.value_mask; + value_mask: usize, +) -> (String, u64, String, TaxonCounters) { + // let value_mask = hash_config.value_mask; + let mut cur_taxon_counts = TaxonCounters::new(); let mut counts = HashMap::new(); - let (hit_groups, cur_counts, hit_string) = stat_hits(hits, &mut counts, value_mask, taxonomy); - - cur_counts.iter().for_each(|entry| { - cur_taxon_counts - .entry(*entry.key()) - .or_default() - .merge(entry.value()) - .unwrap(); - }); - let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold); + let hit_groups = hits.capacity(); + let hit_string = stat_hits( + hits, + &mut counts, + value_mask, + taxonomy, + &mut cur_taxon_counts, + ); + + // cur_counts.iter().for_each(|(key, value)| { + // cur_taxon_counts + // .entry(*key) + // 
.or_default() + // .merge(value) + // .unwrap(); + // }); + + let mut call = resolve_tree(&counts, taxonomy, required_score); if call > 0 && hit_groups < minimum_hit_groups { call = 0; }; @@ -278,5 +231,5 @@ pub fn process_hitgroup( "U" }; - (clasify.to_owned(), ext_call, hit_string) + (clasify.to_owned(), ext_call, hit_string, cur_taxon_counts) } diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index 29a6545..b7e8600 100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -266,7 +266,7 @@ pub fn convert_fna_to_k2_format>( for record in seqs { let header = &record.header; - record.body.fold(&mut k2_cell_list, |k2_cell_list, m_iter| { + record.body.apply_mut(|m_iter| { if let Some(ext_taxid) = id_to_taxon_map.get(&header.id) { let taxid = taxonomy.get_internal_id(*ext_taxid); let k2_cell: Vec<(usize, Slot)> = m_iter diff --git a/kr2r/src/kr2r_data.rs b/kr2r/src/kr2r_data.rs index 6e522e6..06b4770 100644 --- a/kr2r/src/kr2r_data.rs +++ b/kr2r/src/kr2r_data.rs @@ -1,6 +1,8 @@ +use crate::compact_hash::Row; use crate::utils::open_file; // use crate::{Meros, CURRENT_REVCOM_VERSION}; use seqkmer::Meros; +use seqkmer::OptionPair; use seqkmer::CURRENT_REVCOM_VERSION; use std::fs::File; use std::io::{Read, Result as IoResult, Write}; @@ -30,6 +32,26 @@ pub fn u64_to_option(value: u64) -> Option { Option::from(value).filter(|&x| x != 0) } +pub struct HitGroup { + pub rows: Vec, + /// example: (0..10], 左开右闭 + pub range: OptionPair<(usize, usize)>, +} + +impl HitGroup { + pub fn new(rows: Vec, range: OptionPair<(usize, usize)>) -> Self { + Self { rows, range } + } + + pub fn capacity(&self) -> usize { + self.range.reduce(0, |acc, range| acc + range.1 - range.0) + } + + pub fn required_score(&self, confidence_threshold: f64) -> u64 { + (confidence_threshold * self.capacity() as f64).ceil() as u64 + } +} + /// 顺序不能错 #[repr(C)] #[derive(Debug)] diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs index c2a7c1b..b530134 100644 --- a/seqkmer/src/fasta.rs +++ b/seqkmer/src/fasta.rs @@ 
-4,6 +4,7 @@ use crate::utils::OptionPair; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; +const SEQ_LIMIT: u64 = u64::pow(2, 32); /// FastaReader pub struct FastaReader where @@ -14,6 +15,9 @@ where reads_index: usize, header: Vec, seq: Vec, + + // 批量读取 + batch_size: usize, } impl FastaReader @@ -21,10 +25,10 @@ where R: Read + Send, { pub fn new(reader: R, file_index: usize) -> Self { - Self::with_capacity(reader, file_index, BUFSIZE) + Self::with_capacity(reader, file_index, BUFSIZE, 30) } - pub fn with_capacity(reader: R, file_index: usize, capacity: usize) -> Self { + pub fn with_capacity(reader: R, file_index: usize, capacity: usize, batch_size: usize) -> Self { assert!(capacity >= 3); Self { reader: BufReader::with_capacity(capacity, reader), @@ -32,26 +36,29 @@ where reads_index: 0, header: Vec::new(), seq: Vec::new(), + batch_size, } } - pub fn read_next_entry(&mut self) -> Result, Vec)>> { - // 读取fastq文件header部分 + pub fn read_next_entry<'a>(&'a mut self) -> Result, &'a Vec)>> { + // 清空header和seq缓冲区 self.header.clear(); + self.seq.clear(); + + // 读取header部分 if self.reader.read_until(b'\n', &mut self.header)? == 0 { return Ok(None); } - let mut header = Vec::with_capacity(self.header.len()); - header.extend_from_slice(&self.header); - // 读取fasta文件seq部分 - self.seq.clear(); + trim_end(&mut self.header); + + // 读取seq部分 if self.reader.read_until(b'>', &mut self.seq)? 
== 0 { return Ok(None); } trim_end(&mut self.seq); - let mut seq = Vec::with_capacity(self.seq.len()); - seq.extend_from_slice(&self.seq); - Ok(Some((header, seq))) + + // 返回header和seq的引用 + Ok(Some((&self.header, &self.seq))) } pub fn read_next(&mut self) -> Result> { @@ -68,43 +75,24 @@ where trim_end(&mut self.seq); Ok(Some(())) } -} - -impl FastaReader> { - #[inline] - pub fn from_path>(path: P, file_index: usize) -> Result { - let reader = dyn_reader(path)?; - Ok(Self::new(reader, file_index)) - } -} - -fn check_sequence_length(seq: &Vec) -> bool { - let limit = u64::pow(2, 32); - // 检查seq的长度是否大于2的32次方 - (seq.len() as u64) > limit -} - -impl Reader for FastaReader { - fn next(&mut self) -> Result>>>> { - // if self.read_next()?.is_none() { - // return Ok(None); - // } - let entry = self.read_next_entry()?; - if entry.is_none() { + pub fn _next(&mut self) -> Result>)>> { + if self.read_next()?.is_none() { return Ok(None); } - let (header, seq) = entry.unwrap(); - if check_sequence_length(&seq) { + + let seq_len = self.seq.len(); + // 检查seq的长度是否大于2的32次方 + if seq_len as u64 > SEQ_LIMIT { eprintln!("Sequence length exceeds 2^32, which is not handled."); return Ok(None); } let seq_id = unsafe { - let slice = if header.starts_with(b">") { - &header[1..] + let slice = if self.header.starts_with(b">") { + &self.header[1..] } else { - &header[..] + &self.header[..] 
}; let s = std::str::from_utf8_unchecked(slice); @@ -125,6 +113,39 @@ impl Reader for FastaReader { format: SeqFormat::Fasta, id: seq_id.to_owned(), }; - Ok(Some(vec![Base::new(seq_header, OptionPair::Single(seq))])) + Ok(Some(( + seq_len, + Base::new(seq_header, OptionPair::Single(self.seq.to_owned())), + ))) + } +} + +impl FastaReader> { + #[inline] + pub fn from_path>(path: P, file_index: usize) -> Result { + let reader = dyn_reader(path)?; + Ok(Self::new(reader, file_index)) + } +} + +impl Reader for FastaReader { + fn next(&mut self) -> Result>>>> { + let mut seqs = Vec::new(); + let mut total_bytes = 0; + let max_bytes = 10 * 1024 * 1024; + + for _ in 0..self.batch_size { + if let Some((seq_len, seq)) = self._next()? { + seqs.push(seq); + total_bytes += seq_len; + if total_bytes > max_bytes { + break; + } + } else { + break; + } + } + + Ok(if seqs.is_empty() { None } else { Some(seqs) }) } } diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index 4e65337..292b6f0 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -1,5 +1,6 @@ use crate::reader::{dyn_reader, trim_end, trim_pair_info, Reader, BUFSIZE}; use crate::seq::{Base, SeqFormat, SeqHeader}; +use crate::utils::OptionPair; use std::io::{BufRead, BufReader, Read, Result}; use std::path::Path; @@ -29,43 +30,6 @@ where } } - pub fn read_next_entry(&mut self) -> Result, Vec)>> { - // 读取fastq文件header部分 - let mut header: Vec = Vec::new(); - if self.reader.read_until(b'\n', &mut header)? == 0 { - return Ok(None); - } - // 读取fastq文件seq部分 - let mut seq = Vec::new(); - if self.reader.read_until(b'\n', &mut seq)? == 0 { - return Ok(None); - } - trim_end(&mut seq); - - // 读取fastq文件+部分 - self.plus.clear(); - if self.reader.read_until(b'\n', &mut self.plus)? == 0 { - return Ok(None); - } - - // 读取fastq文件quals部分 - self.quals.clear(); - if self.reader.read_until(b'\n', &mut self.quals)? 
== 0 { - return Ok(None); - } - trim_end(&mut self.quals); - - if self.quality_score > 0 { - for (base, &qscore) in self.seq.iter_mut().zip(self.quals.iter()) { - if (qscore as i32 - '!' as i32) < self.quality_score { - *base = b'x'; - } - } - } - - Ok(Some((header, seq))) - } - pub fn read_next(&mut self) -> Result> { // 读取fastq文件header部分 self.header.clear(); @@ -104,229 +68,7 @@ where } } -/// FastqReader pub struct FastqReader { - inner: QReader, - file_index: usize, - reads_index: usize, - // 批量读取 - batch_size: usize, -} - -impl FastqReader -where - R: Read + Send, -{ - pub fn new(reader: R, file_index: usize, quality_score: i32) -> Self { - Self::with_capacity(reader, file_index, BUFSIZE, quality_score, 30) - } - - pub fn with_capacity<'a>( - reader: R, - file_index: usize, - capacity: usize, - quality_score: i32, - batch_size: usize, - ) -> Self { - assert!(capacity >= 3); - Self { - inner: QReader::with_capacity(reader, capacity, quality_score), - file_index, - reads_index: 0, - batch_size, - } - } - - pub fn read_next_entry(&mut self) -> Result>>> { - let entry = self.inner.read_next_entry()?; - if entry.is_none() { - return Ok(None); - } - - let (header, seq) = entry.unwrap(); - - let seq_id = unsafe { - let s = std::str::from_utf8_unchecked(&header[1..]); - let first_space_index = s - .as_bytes() - .iter() - .position(|&c| c == b' ') - .unwrap_or(s.len()); - - // 直接从原始切片创建第一个单词的切片 - &s[..first_space_index] - }; - self.reads_index += 1; - - let seq_header = SeqHeader { - file_index: self.file_index, - reads_index: self.reads_index, - format: SeqFormat::Fasta, - id: seq_id.to_owned(), - }; - - let seq = Base::new(seq_header, OptionPair::Single(seq)); - Ok(Some(seq)) - } - - pub fn read_next(&mut self) -> Result>>> { - if self.inner.read_next()?.is_none() { - return Ok(None); - } - - let seq_id = unsafe { - let s = std::str::from_utf8_unchecked(&self.inner.header[1..]); - let first_space_index = s - .as_bytes() - .iter() - .position(|&c| c == b' ') - 
.unwrap_or(s.len()); - - // 直接从原始切片创建第一个单词的切片 - &s[..first_space_index] - }; - self.reads_index += 1; - - let seq_header = SeqHeader { - file_index: self.file_index, - reads_index: self.reads_index, - format: SeqFormat::Fasta, - id: seq_id.to_owned(), - }; - - let seq = Base::new(seq_header, OptionPair::Single(self.inner.seq.to_owned())); - Ok(Some(seq)) - } -} - -impl FastqReader> { - #[inline] - pub fn from_path>( - path: P, - file_index: usize, - quality_score: i32, - ) -> Result { - let reader = dyn_reader(path)?; - Ok(Self::new(reader, file_index, quality_score)) - } -} - -impl Reader for FastqReader -where - R: Read + Send, -{ - fn next(&mut self) -> Result>>>> { - let seqs: Vec>> = (0..self.batch_size) - .filter_map(|_| self.read_next_entry().transpose()) // 将 Result, _> 转换为 Option> - .collect::>>()?; - - Ok(Some(seqs).filter(|v| !v.is_empty())) - } -} - -/// FastqPairReader -pub struct FastqPairReader { - inner1: QReader, - inner2: QReader, - file_index: usize, - reads_index: usize, - // 批量读取 - batch_size: usize, -} - -impl FastqPairReader -where - R: Read + Send, -{ - pub fn new(reader1: R, reader2: R, file_index: usize, score: i32) -> Self { - Self::with_capacity(reader1, reader2, file_index, BUFSIZE, score, 30) - } - - pub fn with_capacity<'a>( - reader1: R, - reader2: R, - file_index: usize, - capacity: usize, - score: i32, - batch_size: usize, - ) -> Self { - assert!(capacity >= 3); - Self { - inner1: QReader::with_capacity(reader1, capacity, score), - inner2: QReader::with_capacity(reader2, capacity, score), - file_index, - reads_index: 0, - batch_size, - } - } - - pub fn read_next(&mut self) -> Result>>> { - if self.inner1.read_next()?.is_none() { - return Ok(None); - } - - if self.inner2.read_next()?.is_none() { - return Ok(None); - } - - let seq_id = unsafe { - let s = std::str::from_utf8_unchecked(&self.inner1.header[1..]); - let first_space_index = s - .as_bytes() - .iter() - .position(|&c| c == b' ') - .unwrap_or(s.len()); - - // 
直接从原始切片创建第一个单词的切片 - &s[..first_space_index] - }; - self.reads_index += 1; - - let seq_header = SeqHeader { - file_index: self.file_index, - reads_index: self.reads_index, - format: SeqFormat::Fasta, - id: trim_pair_info(seq_id), - }; - - let sequence = Base::new( - seq_header, - OptionPair::Pair(self.inner1.seq.to_owned(), self.inner2.seq.to_owned()), - ); - Ok(Some(sequence)) - } -} - -impl FastqPairReader> { - #[inline] - pub fn from_path>( - path1: P, - path2: P, - file_index: usize, - quality_score: i32, - ) -> Result { - let reader1 = dyn_reader(path1)?; - let reader2 = dyn_reader(path2)?; - Ok(Self::new(reader1, reader2, file_index, quality_score)) - } -} - -impl Reader for FastqPairReader -where - R: Read + Send, -{ - fn next(&mut self) -> Result>>>> { - let seqs: Vec>> = (0..self.batch_size) - .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> - .collect::>>()?; - - Ok(Some(seqs).filter(|v| !v.is_empty())) - } -} - -use crate::utils::OptionPair; - -pub struct FastxPairReader { inner: OptionPair>, file_index: usize, reads_index: usize, @@ -334,7 +76,7 @@ pub struct FastxPairReader { batch_size: usize, } -impl FastxPairReader +impl FastqReader where R: Read + Send, { @@ -424,7 +166,7 @@ where } } -impl FastxPairReader> { +impl FastqReader> { #[inline] pub fn from_path>( paths: OptionPair

, @@ -436,7 +178,7 @@ impl FastxPairReader> { } } -impl Reader for FastxPairReader +impl Reader for FastqReader where R: Read + Send, { diff --git a/seqkmer/src/fastx.rs b/seqkmer/src/fastx.rs new file mode 100644 index 0000000..3dec5b9 --- /dev/null +++ b/seqkmer/src/fastx.rs @@ -0,0 +1,45 @@ +use crate::fasta::FastaReader; +use crate::fastq::FastqReader; +use crate::reader::{detect_file_format, Reader}; +use crate::seq::{Base, SeqFormat}; +use crate::utils::OptionPair; +use std::io::Result; +use std::path::Path; + +pub struct FastxReader { + inner: R, +} + +impl FastxReader { + pub fn new(inner: R) -> Self { + Self { inner } + } +} + +impl Reader for FastxReader { + fn next(&mut self) -> Result>>>> { + self.inner.next() + } +} +impl FastxReader> { + pub fn from_paths>( + paths: OptionPair

, + file_index: usize, + quality_score: i32, + ) -> Result { + let file_format = paths.map(|path: &P| detect_file_format(path)); + + match file_format? { + OptionPair::Single(SeqFormat::Fasta) => { + let reader = FastaReader::from_path(paths.single().unwrap().as_ref(), file_index)?; + Ok(Self::new(Box::new(reader) as Box)) + } + OptionPair::Single(SeqFormat::Fastq) + | OptionPair::Pair(SeqFormat::Fastq, SeqFormat::Fastq) => { + let reader = FastqReader::from_path(paths, file_index, quality_score)?; + Ok(Self::new(Box::new(reader) as Box)) + } + _ => panic!("Unsupported file format combination"), + } + } +} diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs index 6332f98..3beae3e 100644 --- a/seqkmer/src/lib.rs +++ b/seqkmer/src/lib.rs @@ -1,5 +1,6 @@ mod fasta; mod fastq; +mod fastx; mod feat; mod mmscanner; mod parallel; @@ -9,6 +10,7 @@ mod utils; pub use fasta::*; pub use fastq::*; +pub use fastx::*; pub use feat::constants::*; pub use feat::*; pub use mmscanner::MinimizerIterator; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs index 9b79cbf..ce2fe70 100644 --- a/seqkmer/src/mmscanner.rs +++ b/seqkmer/src/mmscanner.rs @@ -235,6 +235,34 @@ impl<'a> Base> { pub fn fmt_size(&self) -> String { self.body.reduce_str("|", |m_iter| m_iter.size.to_string()) } + + pub fn fold(&mut self, mut f: F) -> Vec + where + F: FnMut(&mut Vec, &mut MinimizerIterator<'a>, usize) -> usize, + T: Clone, + { + let mut init = Vec::new(); + match &mut self.body { + OptionPair::Single(m_iter) => { + f(&mut init, m_iter, 0); + } + OptionPair::Pair(m_iter1, m_iter2) => { + let offset = f(&mut init, m_iter1, 0); + f(&mut init, m_iter2, offset); + } + } + init + } + + pub fn range(&self) -> OptionPair<(usize, usize)> { + match &self.body { + OptionPair::Single(m_iter) => OptionPair::Single((0, m_iter.size)), + OptionPair::Pair(m_iter1, m_iter2) => { + let size1 = m_iter1.size; + OptionPair::Pair((0, size1), (size1, m_iter2.size + size1)) + } + } + } } pub fn 
scan_sequence<'a>( diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index 47a4ada..59288a7 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -1,9 +1,7 @@ -use crate::fasta::FastaReader; -use crate::fastq::{FastqPairReader, FastqReader}; use crate::mmscanner::{scan_sequence, MinimizerIterator}; -use crate::reader::{detect_file_format, Reader}; -use crate::seq::Base; -use crate::{Meros, SeqFormat}; +use crate::reader::Reader; +use crate::seq::{Base, SeqFormat}; +use crate::{detect_file_format, FastaReader, FastqReader, Meros}; use crossbeam_channel::{bounded, Receiver}; use scoped_threadpool::Pool; use std::io::Result; @@ -30,21 +28,13 @@ pub fn create_reader( file_pair: &[String], file_index: usize, score: i32, -) -> Result> { - let mut files_iter = file_pair.iter(); - let file1 = files_iter.next().cloned().unwrap(); - let file2 = files_iter.next().cloned(); +) -> Result> { + // let mut files_iter = file_pair.iter(); + let paths = crate::OptionPair::from_slice(file_pair); + match detect_file_format(&file_pair[0])? { - SeqFormat::Fastq => { - if let Some(file2) = file2 { - Ok(Box::new(FastqPairReader::from_path( - file1, file2, file_index, score, - )?)) - } else { - Ok(Box::new(FastqReader::from_path(file1, file_index, score)?)) - } - } - SeqFormat::Fasta => Ok(Box::new(FastaReader::from_path(file1, file_index)?)), + SeqFormat::Fastq => Ok(Box::new(FastqReader::from_path(paths, file_index, score)?)), + SeqFormat::Fasta => Ok(Box::new(FastaReader::from_path(&file_pair[0], file_index)?)), } } diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs index 7a7f202..77309d7 100644 --- a/seqkmer/src/reader.rs +++ b/seqkmer/src/reader.rs @@ -44,13 +44,8 @@ pub fn open_file>(path: P) -> Result { } pub(crate) fn detect_file_format>(path: P) -> io::Result { - let mut file = open_file(path)?; - let read1: Box = if is_gzipped(&mut file)? 
{ - Box::new(GzDecoder::new(file)) - } else { - Box::new(file) - }; - + // let mut file = open_file(path)?; + let read1: Box = dyn_reader(path)?; let reader = BufReader::new(read1); let mut lines = reader.lines(); @@ -89,60 +84,119 @@ pub(crate) fn trim_end(buffer: &mut Vec) { pub const BUFSIZE: usize = 16 * 1024 * 1024; -// pub type SeqVecType = Vec; - pub trait Reader: Send { fn next(&mut self) -> Result>>>>; } -impl Reader for Box { +impl Reader for Box { fn next(&mut self) -> Result>>>> { (**self).next() } } #[derive(Debug)] -pub struct HitGroup { - /// minimizer data size - pub marker_size: usize, - /// hit value vector - pub rows: Vec, - /// pair offset - pub offset: u32, +pub struct PosData { + /// 外部 taxonomy id + pub ext_code: u64, + /// 连续命中次数 + pub count: usize, +} + +impl PosData { + pub fn new(ext_code: u64, count: usize) -> Self { + Self { ext_code, count } + } +} + +impl fmt::Display for PosData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{}", self.ext_code, self.count) + } +} + +use std::fmt; + +#[derive(Debug)] +pub struct SpaceDist { + pub value: Vec, + /// example: (0, 10], 左开右闭 + pub range: (usize, usize), + pos: usize, } -impl HitGroup { - pub fn new(marker_size: usize, rows: Vec, offset: u32) -> Self { +impl SpaceDist { + pub fn new(range: (usize, usize)) -> Self { Self { - marker_size, - rows, - offset, + value: Vec::new(), + range, + pos: range.0, + } + } + + fn fill_with_zeros(&mut self, gap: usize) { + if gap > 0 { + self.value.push(PosData::new(0, gap)); + } + } + + pub fn add(&mut self, ext_code: u64, pos: usize) { + if pos <= self.pos || pos > self.range.1 { + return; // 早期返回,不做任何处理 + } + let gap = pos - self.pos - 1; + + if gap > 0 { + self.fill_with_zeros(gap); + } + + if let Some(last) = self.value.last_mut() { + if last.ext_code == ext_code { + last.count += 1; + } else { + self.value.push(PosData::new(ext_code, 1)); + } + } else { + self.value.push(PosData::new(ext_code, 1)); + } + self.pos = pos; 
+ } + + /// Fills the end of the distribution with zeros if there is remaining space. + pub fn fill_tail_with_zeros(&mut self) { + if self.pos < self.range.1 { + self.fill_with_zeros(self.range.1 - self.pos); + self.pos = self.range.1; } } } -impl OptionPair> { - /// Synchronizes the offset of the second element of a `Pair` to the `cap` of the first element. - /// This alignment is only necessary when the `rows` property of the `HitGroup` is in an - /// increasing order. If `rows` is not increasing, aligning the offset based on `cap` may not - /// be appropriate or required. - /// - /// # Example - /// - /// ``` - /// let mut hit_group1 = HitGroup::new(10, vec![1, 2, 3], 0); // Increasing `rows` - /// let mut hit_group2 = HitGroup::new(20, vec![4, 5, 6], 0); - /// - /// let mut pair = OptionPair::Pair((hit_group1, hit_group2)); - /// pair.align_offset(); - /// ``` - pub fn align_offset(&mut self) { - if let OptionPair::Pair(ref first, ref mut second) = self { - second.offset = first.marker_size as u32; +impl fmt::Display for SpaceDist { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for (i, data) in self.value.iter().enumerate() { + if i > 0 { + write!(f, " ")?; + } + write!(f, "{}", data)?; + } + write!(f, "") + } +} + +impl OptionPair { + pub fn add(&mut self, ext_code: u64, pos: usize) { + match self { + OptionPair::Single(sd) => sd.add(ext_code, pos), + OptionPair::Pair(sd1, sd2) => { + if pos > sd1.range.1 { + sd2.add(ext_code, pos) + } else { + sd1.add(ext_code, pos) + } + } } } - pub fn total_marker_size(&self) -> usize { - self.reduce(0, |acc, hit| acc + hit.marker_size) + pub fn fill_tail_with_zeros(&mut self) { + self.apply_mut(|sd| sd.fill_tail_with_zeros()); } } diff --git a/seqkmer/src/utils.rs b/seqkmer/src/utils.rs index 56f5b6c..f5e77bc 100644 --- a/seqkmer/src/utils.rs +++ b/seqkmer/src/utils.rs @@ -5,6 +5,13 @@ pub enum OptionPair { } impl OptionPair { + pub fn single(&self) -> Option<&T> { + match self { + 
OptionPair::Single(value) => Some(value), + _ => None, + } + } + // 它接受一个泛型闭包 F,并返回一个新的 OptionPair pub fn map(&self, mut f: F) -> Result, E> where @@ -20,6 +27,18 @@ impl OptionPair { } } + // pub fn concat(&self, init: &mut U, mut f: F) -> V + // where + // F: FnMut(&mut U, &T) -> V, + // { + // match self { + // OptionPair::Single(t) => f(init, t), + // OptionPair::Pair(t1, t2) => { + // f(init, t1); + // f(init, t2) + // } + // } + // } pub fn reduce(&self, init: U, mut f: F) -> U where F: FnMut(U, &T) -> U, @@ -33,20 +52,6 @@ impl OptionPair { } } - pub fn fold(&mut self, init: &mut V, mut func: F) -> OptionPair - where - F: FnMut(&mut V, &mut T) -> U, - { - match self { - OptionPair::Single(seq) => OptionPair::Single(func(init, seq)), - OptionPair::Pair(ref mut seq1, ref mut seq2) => { - let res1 = func(init, seq1); - let res2 = func(init, seq2); - OptionPair::Pair(res1, res2) - } - } - } - pub fn reduce_str(&self, sep: &str, mut f: F) -> String where F: FnMut(&T) -> String, From 836a3430dbe01362b7bce51e41b0dca32ed07bd8 Mon Sep 17 00:00:00 2001 From: dagou Date: Sat, 22 Jun 2024 10:31:46 +0800 Subject: [PATCH 13/18] seq kmer --- kr2r/src/bin/annotate.rs | 2 +- kr2r/src/bin/build_k2_db.rs | 15 ++++++--------- kr2r/src/bin/classify.rs | 3 +-- kr2r/src/bin/estimate_capacity.rs | 3 ++- kr2r/src/bin/hashshard.rs | 2 +- kr2r/src/bin/kun.rs | 11 ++--------- kr2r/src/bin/resolve.rs | 2 +- kr2r/src/bin/splitr.rs | 3 +-- kr2r/src/db.rs | 2 -- seqkmer/src/fastq.rs | 2 +- seqkmer/src/parallel.rs | 3 +-- 11 files changed, 17 insertions(+), 31 deletions(-) diff --git a/kr2r/src/bin/annotate.rs b/kr2r/src/bin/annotate.rs index b762f5f..b685872 100644 --- a/kr2r/src/bin/annotate.rs +++ b/kr2r/src/bin/annotate.rs @@ -235,7 +235,7 @@ pub fn run(args: Args) -> Result<()> { // 开始计时 let start = Instant::now(); - println!("start..."); + println!("annotate start..."); for chunk_file in chunk_files { println!("chunk_file {:?}", chunk_file); process_chunk_file(&args, chunk_file, 
&hash_files)?; diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs index 1eb1c17..0663471 100644 --- a/kr2r/src/bin/build_k2_db.rs +++ b/kr2r/src/bin/build_k2_db.rs @@ -7,8 +7,8 @@ use kr2r::db::{ write_config_to_file, }; use kr2r::utils::{ - create_partition_files, create_partition_writers, find_library_fna_files, format_bytes, - get_file_limit, read_id_to_taxon_map, + create_partition_files, create_partition_writers, find_library_fna_files, get_file_limit, + read_id_to_taxon_map, }; use kr2r::IndexOptions; use std::fs::remove_file; @@ -25,10 +25,9 @@ pub struct Args { #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")] pub hash_capacity: usize, - /// chunk temp directory - #[clap(long)] - pub chunk_dir: PathBuf, - + // chunk temp directory + // #[clap(long)] + // pub chunk_dir: PathBuf, /// 包含原始配置 #[clap(flatten)] pub build: Build, @@ -83,11 +82,9 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box Result<()> { if hash_config.hash_capacity == 0 { panic!("`hash_capacity` can't be zero!"); } - println!("start..."); + println!("classify start..."); let start = Instant::now(); let meros = idx_opts.as_meros(); let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index 8f8d39e..5cc42dc 100644 --- a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -88,7 +88,6 @@ fn process_sequence( read_parallel( &mut reader, args.threads, - args.threads - 2, &meros, |record_set| { let mut minimizer_set = HashSet::new(); @@ -152,6 +151,8 @@ pub fn run(args: Args) -> usize { 
panic!("Error: No library.fna files found in the specified directory. Please ensure that the directory contains at least one library.fna file and try again."); } + println!("estimate start... "); + for fna_file in fna_files { let args_clone = Args { database: source.clone(), diff --git a/kr2r/src/bin/hashshard.rs b/kr2r/src/bin/hashshard.rs index 8fce0d8..e5b5256 100644 --- a/kr2r/src/bin/hashshard.rs +++ b/kr2r/src/bin/hashshard.rs @@ -69,7 +69,7 @@ pub fn run(args: Args) -> IOResult<()> { let hash_config = HashConfig::from_hash_header(index_filename)?; let partition = (hash_config.capacity + args.hash_capacity - 1) / args.hash_capacity; - println!("start..."); + println!("hashshard start..."); // 开始计时 let start = Instant::now(); diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs index 756181e..425c091 100644 --- a/kr2r/src/bin/kun.rs +++ b/kr2r/src/bin/kun.rs @@ -22,13 +22,7 @@ struct BuildArgs { #[clap(long)] pub k2d_dir: Option, - /// chunk directory - #[clap(long)] - chunk_dir: PathBuf, - - #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")] - pub hash_capacity: usize, - + // chunk_dir: PathBuf, #[clap(flatten)] pub build: Build, @@ -117,8 +111,7 @@ impl From for build_k2_db::Args { build: item.build, k2d_dir: item.k2d_dir, taxo: item.taxo, - chunk_dir: item.chunk_dir, - hash_capacity: item.hash_capacity, + hash_capacity: parse_size("1G").unwrap(), } } } diff --git a/kr2r/src/bin/resolve.rs b/kr2r/src/bin/resolve.rs index 8c3c96b..9653d21 100644 --- a/kr2r/src/bin/resolve.rs +++ b/kr2r/src/bin/resolve.rs @@ -223,7 +223,7 @@ pub fn run(args: Args) -> Result<()> { // 开始计时 let start = Instant::now(); - 
println!("start..."); + println!("resolve start..."); for i in 0..partition { let sample_file = &sample_files[i]; diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index 41f37dc..e276407 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -148,7 +148,6 @@ where read_parallel( reader, - args.num_threads as usize - 2, args.num_threads as usize, &meros, |seqs| { @@ -268,7 +267,7 @@ pub fn run(args: Args) -> Result<()> { if hash_config.hash_capacity == 0 { panic!("`hash_capacity` can't be zero!"); } - println!("start..."); + println!("splitr start..."); let file_num_limit = get_file_limit(); if hash_config.partition >= file_num_limit { panic!("Exceeds File Number Limit"); diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index b7e8600..e480f1f 100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -252,14 +252,12 @@ pub fn convert_fna_to_k2_format>( threads: u32, ) { let mut reader = FastaReader::from_path(fna_file, 1).unwrap(); - let queue_len = (threads - 2) as usize; let value_bits = hash_config.value_bits; let cell_size = std::mem::size_of::>(); read_parallel( &mut reader, threads as usize, - queue_len, &meros, |seqs| { let mut k2_cell_list = Vec::new(); diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs index 292b6f0..218b863 100644 --- a/seqkmer/src/fastq.rs +++ b/seqkmer/src/fastq.rs @@ -184,7 +184,7 @@ where { fn next(&mut self) -> Result>>>> { let seqs: Vec>> = (0..self.batch_size) - .filter_map(|_| self.read_next().transpose()) // 将 Result, _> 转换为 Option> + .filter_map(|_| self.read_next().transpose()) .collect::>>()?; Ok(Some(seqs).filter(|v| !v.is_empty())) diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs index 59288a7..ab09045 100644 --- a/seqkmer/src/parallel.rs +++ b/seqkmer/src/parallel.rs @@ -41,7 +41,6 @@ pub fn create_reader( pub fn read_parallel( reader: &mut R, n_threads: usize, - buffer_len: usize, meros: &Meros, work: W, func: F, @@ -54,7 +53,7 @@ where F: FnOnce(&mut ParallelResult>) -> Out + Send, { 
assert!(n_threads > 2); - assert!(n_threads <= buffer_len); + let buffer_len = n_threads + 2; let (sender, receiver) = bounded::>>>(buffer_len); let (done_send, done_recv) = bounded::>(buffer_len); let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver From 0d19bce111d8c8af927fac7884f7490c3ecbc944 Mon Sep 17 00:00:00 2001 From: dagou Date: Sat, 22 Jun 2024 21:34:51 +0800 Subject: [PATCH 14/18] merge fna --- kr2r/Cargo.toml | 1 + kr2r/src/args.rs | 37 ++--- kr2r/src/bin/build_k2_db.rs | 23 ++-- kr2r/src/bin/classify.rs | 6 +- kr2r/src/bin/estimate_capacity.rs | 10 +- kr2r/src/bin/kun.rs | 48 +++++-- kr2r/src/bin/merge_fna.rs | 219 ++++++++++++++++++++++++++++++ kr2r/src/bin/splitr.rs | 8 +- kr2r/src/db.rs | 4 +- kr2r/src/utils.rs | 16 ++- ncbi/Cargo.toml | 1 + ncbi/src/fna.rs | 22 ++- ncbi/src/load.rs | 4 +- ncbi/src/main.rs | 47 +++---- ncbi/src/task.rs | 6 +- 15 files changed, 349 insertions(+), 103 deletions(-) create mode 100644 kr2r/src/bin/merge_fna.rs diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml index e662178..5826ace 100644 --- a/kr2r/Cargo.toml +++ b/kr2r/Cargo.toml @@ -28,6 +28,7 @@ libc = "0.2" regex = "1.5.4" flate2 = "1.0" dashmap = { version = "5.5.3", features = ["rayon"] } +num_cpus = "1.13.1" [dev-dependencies] criterion = "0.5.1" diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs index aed1933..059dad8 100644 --- a/kr2r/src/args.rs +++ b/kr2r/src/args.rs @@ -31,30 +31,11 @@ pub struct Build { pub requested_bits_for_taxid: u8, /// Number of threads - #[clap(short = 'p', long, default_value_t = 10)] + #[clap(short = 'p', long, default_value_t = num_cpus::get())] pub threads: usize, } -#[derive(Parser, Debug, Clone)] -#[clap(version, about = "taxonomy")] -pub struct Taxo { - // /// Kraken 2 taxonomy filename, default = $database/taxo.k2d - // #[clap(short = 't')] - // pub taxonomy_filename: Option, - - // #[clap(short = 'm', required = true)] - // pub id_to_taxon_map_filename: PathBuf, - /// Sequence ID to taxon map filename - /// seqid2taxid.map 
file path, default = $database/seqid2taxid.map - #[arg(short = 'm')] - pub id_to_taxon_map_filename: Option, - - /// NCBI taxonomy directory name, default = $database/taxonomy - #[clap(short, long)] - pub ncbi_taxonomy_directory: Option, -} - -const BATCH_SIZE: usize = 8 * 1024 * 1024; +const BATCH_SIZE: usize = 16 * 1024 * 1024; /// Command line arguments for the classify program. /// @@ -78,7 +59,7 @@ pub struct ClassifyArgs { #[clap(long)] pub chunk_dir: PathBuf, - /// Enables use of a Kraken 2 compatible shared database. Default is false. + /// Enables use of a Kraken 2 compatible shared database. #[clap(long, default_value_t = false)] pub kraken_db_type: bool, @@ -94,7 +75,7 @@ pub struct ClassifyArgs { #[clap(short = 'S', long = "single-file-pairs", action)] pub single_file_pairs: bool, - /// Minimum quality score for FASTQ data, default is 0. + /// Minimum quality score for FASTQ data. #[clap( short = 'Q', long = "minimum-quality-score", @@ -103,15 +84,15 @@ pub struct ClassifyArgs { )] pub minimum_quality_score: i32, - /// The number of threads to use, default is 10. - #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = 10)] - pub num_threads: i32, + /// The number of threads to use. + #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = num_cpus::get())] + pub num_threads: usize, - /// 批量处理大小 default: 8MB + /// 批量处理大小 default: 16MB #[clap(long, default_value_t = BATCH_SIZE)] pub batch_size: usize, - /// Confidence score threshold, default is 0.0. 
+ /// Confidence score threshold #[clap( short = 'T', long = "confidence-threshold", diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs index 0663471..84c9886 100644 --- a/kr2r/src/bin/build_k2_db.rs +++ b/kr2r/src/bin/build_k2_db.rs @@ -1,6 +1,6 @@ // 使用时需要引用模块路径 use clap::Parser; -use kr2r::args::{parse_size, Build, Taxo}; +use kr2r::args::{parse_size, Build}; use kr2r::compact_hash::HashConfig; use kr2r::db::{ convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file, @@ -16,7 +16,7 @@ use std::path::PathBuf; use std::time::Instant; #[derive(Parser, Debug, Clone)] -#[clap(author, version, about="build database", long_about = None)] +#[clap(author, version, about="build `k2d` files", long_about = None)] pub struct Args { /// database hash chunk directory and other files #[clap(long)] @@ -31,19 +31,15 @@ pub struct Args { /// 包含原始配置 #[clap(flatten)] pub build: Build, - - #[clap(flatten)] - pub taxo: Taxo, + // #[arg(short = 'm')] + // pub id_to_taxon_map_filename: Option, } pub fn run(args: Args, required_capacity: usize) -> Result<(), Box> { let file_num_limit = get_file_limit(); let meros = args.build.klmt.as_meros(); - let id_to_taxon_map_filename = args - .taxo - .id_to_taxon_map_filename - .unwrap_or(args.build.database.join("seqid2taxid.map")); + let id_to_taxon_map_filename = args.build.database.join("seqid2taxid.map"); let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?; @@ -52,10 +48,7 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box Result<(), Box Result<(), Box String { - let path = Path::new(input_path); +fn build_output_path>(input_path: &P, extension: &str) -> String { + let path = input_path.as_ref(); let parent_dir = path.parent().unwrap_or_else(|| Path::new("")); let stem = path.file_stem().unwrap_or_else(|| path.as_os_str()); @@ -59,8 +59,8 @@ fn build_output_path(input_path: &str, extension: &str) -> String { output_path.to_str().unwrap().to_owned() } -fn 
process_sequence( - fna_file: &str, +fn process_sequence>( + fna_file: &P, // hllp: &mut HyperLogLogPlus, args: Args, ) -> HyperLogLogPlus { @@ -142,7 +142,7 @@ pub fn run(args: Args) -> usize { let source: PathBuf = args.database.clone(); let fna_files = if source.is_file() { - vec![source.to_string_lossy().to_string()] + vec![source.clone()] } else { find_library_fna_files(args.database) }; diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs index 425c091..16ebdd1 100644 --- a/kr2r/src/bin/kun.rs +++ b/kr2r/src/bin/kun.rs @@ -4,31 +4,37 @@ mod build_k2_db; mod classify; mod estimate_capacity; mod hashshard; +mod merge_fna; mod resolve; -mod seqid2taxid; +// mod seqid2taxid; mod splitr; use kr2r::args::ClassifyArgs; -use kr2r::args::{parse_size, Build, Taxo}; +use kr2r::args::{parse_size, Build}; use kr2r::utils::find_and_sort_files; // use std::io::Result; use std::path::PathBuf; use std::time::Instant; #[derive(Parser, Debug, Clone)] -#[clap(author, version, about="build database", long_about = None)] +#[clap(author, version, about="build `k2d` files", long_about = None)] struct BuildArgs { /// database hash chunk directory and other files #[clap(long)] pub k2d_dir: Option, + /// Directory to store downloaded files + #[arg(short, long, default_value = "lib")] + pub download_dir: PathBuf, + // chunk_dir: PathBuf, #[clap(flatten)] pub build: Build, - #[clap(flatten)] - taxo: Taxo, - + // #[arg(short = 'm')] + // pub id_to_taxon_map_filename: Option, + // #[clap(flatten)] + // taxo: Taxo, /// estimate capacity from cache if exists #[arg(long, default_value_t = true)] cache: bool, @@ -110,17 +116,25 @@ impl From for build_k2_db::Args { Self { build: item.build, k2d_dir: item.k2d_dir, - taxo: item.taxo, hash_capacity: parse_size("1G").unwrap(), } } } -impl From for seqid2taxid::Args { +// impl From for seqid2taxid::Args { +// fn from(item: BuildArgs) -> Self { +// Self { +// database: item.build.database, +// id_to_taxon_map_filename: 
item.taxo.id_to_taxon_map_filename, +// } +// } +// } + +impl From for merge_fna::Args { fn from(item: BuildArgs) -> Self { Self { + download_dir: item.download_dir, database: item.build.database, - id_to_taxon_map_filename: item.taxo.id_to_taxon_map_filename, } } } @@ -128,7 +142,7 @@ impl From for seqid2taxid::Args { #[derive(Subcommand, Debug)] enum Commands { Estimate(estimate_capacity::Args), - Seqid2taxid(seqid2taxid::Args), + // Seqid2taxid(seqid2taxid::Args), Build(BuildArgs), Hashshard(hashshard::Args), Splitr(splitr::Args), @@ -136,21 +150,25 @@ enum Commands { Resolve(resolve::Args), Classify(ClassifyArgs), Direct(classify::Args), + MergeFna(merge_fna::Args), } fn main() -> Result<(), Box> { let args = Args::parse(); match args.cmd { + Commands::MergeFna(cmd_args) => { + merge_fna::run(cmd_args)?; + } Commands::Estimate(cmd_args) => { estimate_capacity::run(cmd_args); } - Commands::Seqid2taxid(cmd_args) => { - seqid2taxid::run(cmd_args)?; - } + // Commands::Seqid2taxid(cmd_args) => { + // seqid2taxid::run(cmd_args)?; + // } Commands::Build(cmd_args) => { - let seq_args = seqid2taxid::Args::from(cmd_args.clone()); - seqid2taxid::run(seq_args)?; + let fna_args = merge_fna::Args::from(cmd_args.clone()); + merge_fna::run(fna_args)?; let ec_args = estimate_capacity::Args::from(cmd_args.clone()); let required_capacity = estimate_capacity::run(ec_args); diff --git a/kr2r/src/bin/merge_fna.rs b/kr2r/src/bin/merge_fna.rs new file mode 100644 index 0000000..f111879 --- /dev/null +++ b/kr2r/src/bin/merge_fna.rs @@ -0,0 +1,219 @@ +use clap::Parser; + +use kr2r::utils::{find_files, open_file}; +use std::collections::HashMap; +use std::fs::{create_dir_all, File, OpenOptions}; +use std::io::{BufRead, BufReader, BufWriter, Result, Write}; +use std::path::PathBuf; + +use flate2::read::GzDecoder; + +#[derive(Parser, Debug, Clone)] +#[clap(version, about = "A tool for processing genomic files")] +pub struct Args { + /// Directory to store downloaded files + #[arg(short, 
long, default_value = "lib")] + pub download_dir: PathBuf, + + /// ncbi library fna database directory + #[arg(long = "db", required = true)] + pub database: PathBuf, + // /// seqid2taxid.map file path, default = $database/seqid2taxid.map + // #[arg(short = 'm', long)] + // pub id_to_taxon_map_filename: Option, +} + +fn parse_assembly_fna(assembly_file: &PathBuf, site: &str) -> Result> { + let mut gz_files: HashMap = HashMap::new(); + let file = open_file(&assembly_file)?; + let reader = BufReader::new(file); + let lines = reader.lines(); + + let parent_path = assembly_file + .parent() + .expect("Can't find assembly file parent directory"); + for line in lines { + let line = line?; + if line.starts_with('#') { + continue; + } + + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() > 19 { + let (taxid, _, ftp_path) = (fields[5], fields[11], fields[19]); + + if ftp_path == "na" { + continue; + } + + // let levels = vec!["Complete Genome", "Chromosome"]; + // if !levels.contains(&asm_level) { + // continue; + // } + + let fna_file_name = format!( + "{}/{}/{}_genomic.fna.gz", + parent_path.to_string_lossy(), + site, + ftp_path.split('/').last().unwrap_or_default() + ); + gz_files.insert(fna_file_name, taxid.into()); + } + } + Ok(gz_files) +} + +fn process_gz_file( + gz_file: &PathBuf, + map_writer: &mut BufWriter, + fna_writer: &mut BufWriter, + fna_start: ®ex::Regex, + taxid: &str, +) -> Result<()> { + let file = open_file(gz_file)?; + let decompressor = GzDecoder::new(BufReader::new(file)); + let mut reader = BufReader::new(decompressor); + + let mut line = String::new(); + let mut map_buffer = String::new(); // Buffer for map writer + let mut fna_buffer = String::new(); // Buffer for fna writer + + while reader.read_line(&mut line)? 
!= 0 { + if let Some(caps) = fna_start.captures(&line) { + let seqid = &caps[1]; + map_buffer.push_str(&format!("kraken:taxid|{}|{}\t{}\n", taxid, seqid, taxid)); + fna_buffer.push_str(&format!(">kraken:taxid|{}|{}", taxid, &line[1..])); + } else { + fna_buffer.push_str(&line); + } + + // Write to the writers if the buffer size exceeds a certain threshold + if map_buffer.len() > 10000 { + map_writer.write_all(map_buffer.as_bytes())?; + map_buffer.clear(); + } + + if fna_buffer.len() > 10000 { + fna_writer.write_all(fna_buffer.as_bytes())?; + fna_buffer.clear(); + } + + line.clear(); + } + + // Write any remaining buffered content + if !map_buffer.is_empty() { + map_writer.write_all(map_buffer.as_bytes())?; + } + + if !fna_buffer.is_empty() { + fna_writer.write_all(fna_buffer.as_bytes())?; + } + + fna_writer.flush()?; + map_writer.flush()?; + + Ok(()) +} + +const PREFIX: &'static str = "assembly_summary"; +const SUFFIX: &'static str = "txt"; + +fn merge_fna(assembly_files: &Vec, database: &PathBuf) -> Result<()> { + let pattern = format!(r"{}_(\S+)\.{}", PREFIX, SUFFIX); + let file_site = regex::Regex::new(&pattern).unwrap(); + + let library_fna_path = database.join("library.fna"); + let seqid2taxid_path = database.join("seqid2taxid.map"); + let mut fna_writer = BufWriter::new( + OpenOptions::new() + .create(true) + .write(true) + .open(&library_fna_path)?, + ); + let mut map_writer = BufWriter::new( + OpenOptions::new() + .create(true) + .write(true) + .open(&seqid2taxid_path)?, + ); + + let fna_start: regex::Regex = regex::Regex::new(r"^>(\S+)").unwrap(); + for assembly_file in assembly_files { + if let Some(caps) = file_site.captures(assembly_file.to_string_lossy().as_ref()) { + if let Some(matched) = caps.get(1) { + let gz_files = parse_assembly_fna(assembly_file, matched.as_str())?; + + for (gz_path, taxid) in gz_files { + let gz_file = PathBuf::from(&gz_path); + if !gz_file.exists() { + // eprintln!("{} does not exist", gz_file.to_string_lossy()); + continue; 
+ } + + process_gz_file( + &gz_file, + &mut map_writer, + &mut fna_writer, + &fna_start, + &taxid, + )?; + } + + fna_writer.flush()?; + map_writer.flush()?; + } + } + } + + Ok(()) +} + +pub fn run(args: Args) -> Result<()> { + let download_dir = args.download_dir; + let database = &args.database; + + let dst_tax_dir = database.join("taxonomy"); + create_dir_all(&dst_tax_dir)?; + + let source_names_file = &download_dir.join("taxonomy").join("names.dmp"); + assert!(source_names_file.exists()); + let dst_name_file = &dst_tax_dir.join("names.dmp"); + if !dst_name_file.exists() { + std::fs::copy(source_names_file, dst_name_file)?; + } + + let source_nodes_file = &download_dir.join("taxonomy").join("nodes.dmp"); + assert!(source_nodes_file.exists()); + let dst_nodes_file = &dst_tax_dir.join("nodes.dmp"); + if !dst_nodes_file.exists() { + std::fs::copy(source_nodes_file, dst_nodes_file)?; + } + + let library_fna_path = database.join("library.fna"); + let seqid2taxid_path = database.join("seqid2taxid.map"); + if library_fna_path.exists() && seqid2taxid_path.exists() { + println!("library.fna and seqid2taxid.map exists!"); + return Ok(()); + } + + if library_fna_path.exists() { + std::fs::remove_file(library_fna_path)?; + } + if seqid2taxid_path.exists() { + std::fs::remove_file(seqid2taxid_path)?; + } + let assembly_files = find_files(&download_dir, &PREFIX, &SUFFIX); + + merge_fna(&assembly_files, &args.database)?; + + Ok(()) +} + +#[allow(dead_code)] +fn main() { + let args = Args::parse(); + if let Err(e) = run(args) { + eprintln!("Application error: {}", e); + } +} diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index e276407..89b46cd 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -37,7 +37,7 @@ pub struct Args { #[clap(short = 'S', long = "single-file-pairs", action)] pub single_file_pairs: bool, - /// Minimum quality score for FASTQ data, default is 0. + /// Minimum quality score for FASTQ data. 
#[clap( short = 'Q', long = "minimum-quality-score", @@ -46,9 +46,9 @@ pub struct Args { )] pub minimum_quality_score: i32, - /// The number of threads to use, default is 10. - #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = 10)] - pub num_threads: i32, + /// The number of threads to use. + #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = num_cpus::get())] + pub num_threads: usize, /// chunk directory #[clap(long)] diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs index e480f1f..6bf80f2 100644 --- a/kr2r/src/db.rs +++ b/kr2r/src/db.rs @@ -249,7 +249,7 @@ pub fn convert_fna_to_k2_format>( hash_config: HashConfig, writers: &mut Vec>, chunk_size: usize, - threads: u32, + threads: usize, ) { let mut reader = FastaReader::from_path(fna_file, 1).unwrap(); let value_bits = hash_config.value_bits; @@ -257,7 +257,7 @@ pub fn convert_fna_to_k2_format>( read_parallel( &mut reader, - threads as usize, + threads, &meros, |seqs| { let mut k2_cell_list = Vec::new(); diff --git a/kr2r/src/utils.rs b/kr2r/src/utils.rs index a32a686..207bd3c 100644 --- a/kr2r/src/utils.rs +++ b/kr2r/src/utils.rs @@ -69,15 +69,25 @@ pub fn expand_spaced_seed_mask(spaced_seed_mask: u64, bit_expansion_factor: u64) new_mask } -pub fn find_library_fna_files>(path: P) -> Vec { +pub fn find_files>(path: P, prefix: &str, suffix: &str) -> Vec { WalkDir::new(path) .into_iter() .filter_map(|e| e.ok()) - .filter(|e| e.path().file_name() == Some("library.fna".as_ref())) - .map(|e| e.path().to_string_lossy().into_owned()) + .filter(|e| { + e.path() + .file_name() + .and_then(|name| name.to_str()) + .map(|name| name.starts_with(prefix) && name.ends_with(suffix)) + .unwrap_or(false) + }) + .map(|e| e.path().to_path_buf()) .collect() } +pub fn find_library_fna_files>(path: P) -> Vec { + find_files(path, "library", ".fna") +} + pub fn summary_prelim_map_files>(data_dir: P) -> Result { let lib_path = data_dir.as_ref().join("library"); diff --git a/ncbi/Cargo.toml 
b/ncbi/Cargo.toml index 1c67f49..f7532e5 100644 --- a/ncbi/Cargo.toml +++ b/ncbi/Cargo.toml @@ -22,3 +22,4 @@ env_logger = "0.11.0" md-5 = "0.10.6" async-compression = "0.4.5" tar = "0.4" +num_cpus = "1.13.1" diff --git a/ncbi/src/fna.rs b/ncbi/src/fna.rs index b78bb19..1d9c5f4 100644 --- a/ncbi/src/fna.rs +++ b/ncbi/src/fna.rs @@ -15,6 +15,7 @@ use tar::Archive; pub async fn decompress_and_extract_tar_gz( gz_path: &PathBuf, out_path: &PathBuf, + files_to_extract: Vec, ) -> std::io::Result<()> { // Open the .tar.gz file let file = File::open(gz_path).await?; @@ -32,7 +33,26 @@ pub async fn decompress_and_extract_tar_gz( // Use the tar crate to decompress the TAR archive let mut archive = Archive::new(&decompressed_data[..]); - archive.unpack(out_path)?; + // archive.unpack(out_path)?; + + // 遍历 TAR 归档中的每个条目 + for entry in archive.entries()? { + let mut entry = entry?; + let path = entry.path()?.to_string_lossy().to_string(); + + // 检查是否为需要提取的文件 + if files_to_extract.contains(&path) { + let out_file_path = out_path.join(&path); + + // 创建输出文件夹 + if let Some(parent) = out_file_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + // 解压缩并写入文件 + entry.unpack(out_file_path)?; + } + } Ok(()) } diff --git a/ncbi/src/load.rs b/ncbi/src/load.rs index 9d65ca4..4656cca 100644 --- a/ncbi/src/load.rs +++ b/ncbi/src/load.rs @@ -101,7 +101,9 @@ impl NcbiFile { NcbiFile::Summary(_) => {} NcbiFile::Genomic(_, _) => {} NcbiFile::Taxonomy(dt1, _) => { - let _ = decompress_and_extract_tar_gz(&dt1.file, &data_dir).await; + let taxo_files: Vec = + vec!["names.dmp".to_string(), "nodes.dmp".to_string()]; + decompress_and_extract_tar_gz(&dt1.file, &data_dir, taxo_files).await?; } } Ok(()) diff --git a/ncbi/src/main.rs b/ncbi/src/main.rs index e1b15b5..fe7fd99 100644 --- a/ncbi/src/main.rs +++ b/ncbi/src/main.rs @@ -52,11 +52,11 @@ fn validate_group(group: &str) -> Result { #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] enum Site { - /// 下载 genbank 资源 
+ /// Download genbank resources Genbank, - /// 下载 refseq 资源 + /// Download refseq resources Refseq, - /// genbank and refseq + /// Both genbank and refseq All, } @@ -76,18 +76,18 @@ impl fmt::Display for Site { #[derive(Subcommand, Debug)] enum Mode { - /// 仅检查文件的 md5 + /// Check the md5 of files only Md5, - /// 解析 genomic 文件,并且生成 library fna 文件 - /// 同时将单个fna文件拼接成group为组的总的fna格式文件, 以便于构建database + /// Parse genomic files and generate a library fna file + /// Also concatenate individual fna files into a group for building a database Fna { - /// library fna 文件存储目录,为了不和原始文件混淆 + /// Directory to store the library fna file to avoid mixing with original files #[clap(value_parser)] out_dir: Option, }, - /// 仅下载和解析 assembly 文件 + /// Download and parse assembly files only Assembly, - /// 单独下载 genomic 文件,指定 url 地址 + /// Download genomic files separately by specifying a URL Url { #[clap(value_parser)] url: String, @@ -98,15 +98,15 @@ enum Mode { #[clap( version, about = "ncbi download resource", - long_about = "从 ncbi 网站上下载 genomes 资源" + long_about = "Download genomes resources from the NCBI website" )] struct Args { - /// 构建数据库的目录 - #[arg(short, long = "db", default_value = "lib")] - database: PathBuf, + /// Directory to store downloaded files + #[arg(short, long, default_value = "lib")] + download_dir: PathBuf, - /// 下载时的并行大小 - #[arg(short, long, default_value = "8")] + /// Number of threads to use for downloading + #[arg(short, long, default_value_t = num_cpus::get() * 2)] num_threads: usize, #[command(subcommand)] @@ -115,14 +115,14 @@ struct Args { #[derive(Subcommand, Debug)] enum Commands { - /// 从 NCBI 下载 taxonomy 文件 (alias: tax) + /// Download taxonomy files from NCBI (alias: tax) #[command(alias = "tax")] Taxonomy, - /// 从 NCBI 下载 genomes 数据 (alias: gen) + /// Download genomes data from NCBI (alias: gen) #[command(alias = "gen")] Genomes { - /// 从 NCBI 哪个站点目录下载(RefSeq或GenBank) + /// Site directory to download from NCBI (RefSeq or GenBank) #[arg(long, value_enum, 
default_value_t = Site::Refseq)] site: Site, @@ -131,18 +131,19 @@ enum Commands { #[arg(long, default_value = "basic")] asm_level: String, - /// 从 NCBI 站点上下载某个种类的数据信息,可以是逗号分隔的多个, archaea,bacteria,viral,fungi,plant,human,protozoa,vertebrate_mammalian,vertebrate_other,invertebrate + /// Type of data to download from NCBI site, can be multiple comma-separated values + /// e.g., archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other, invertebrate #[arg(short, long, value_parser = validate_group)] group: String, - /// 子命令,使用 md5 校验和生成 fna 文件 + /// Subcommand to generate fna files using md5 checksum #[command(subcommand)] mode: Option, }, } async fn async_run(args: Args) -> Result<()> { - let db_path = utils::create_data_dir(&args.database).unwrap(); + let db_path = utils::create_data_dir(&args.download_dir).unwrap(); init_meta(&db_path).await; match args.command { @@ -245,12 +246,12 @@ async fn async_run(args: Args) -> Result<()> { }, Some(Mode::Url { url }) => { if site == Site::All { - log::error!("必须指定合适的site"); + log::error!("Must specify a suitable site"); } else { let result = task::run_download_file(&site.to_string(), &data_dir, &url).await; if result.is_err() { - log::error!("下载文件失败... {:?}", result); + log::error!("download error... 
{:?}", result); } } } diff --git a/ncbi/src/task.rs b/ncbi/src/task.rs index 10cf947..97538b2 100644 --- a/ncbi/src/task.rs +++ b/ncbi/src/task.rs @@ -157,14 +157,14 @@ pub async fn run_taxo(taxo_dir: &PathBuf) -> Result<()> { log::info!("download taxonomy..."); let files = [ "taxdump.tar.gz", - "accession2taxid/nucl_gb.accession2taxid.gz", - "accession2taxid/nucl_wgs.accession2taxid.gz", + // "accession2taxid/nucl_gb.accession2taxid.gz", + // "accession2taxid/nucl_wgs.accession2taxid.gz", ]; for url_path in files.iter() { let ncbi_file = NcbiFile::new_taxo(taxo_dir, &url_path).await; let result = ncbi_file.run().await; if result.is_ok() && url_path.to_string() == "taxdump.tar.gz" { - let _ = ncbi_file.decompress(taxo_dir).await; + ncbi_file.decompress(taxo_dir).await?; } } log::info!("download taxonomy finished..."); From d1ade229d923ff90371ac7973258e0f13c637cc5 Mon Sep 17 00:00:00 2001 From: dagou Date: Sat, 22 Jun 2024 21:51:04 +0800 Subject: [PATCH 15/18] merge fna --- kr2r/src/bin/build_k2_db.rs | 2 +- kr2r/src/bin/kun.rs | 2 +- kr2r/src/bin/merge_fna.rs | 10 ++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs index 84c9886..f0c9d7b 100644 --- a/kr2r/src/bin/build_k2_db.rs +++ b/kr2r/src/bin/build_k2_db.rs @@ -48,7 +48,7 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box, /// Directory to store downloaded files - #[arg(short, long, default_value = "lib")] + #[arg(short, long, required = true)] pub download_dir: PathBuf, // chunk_dir: PathBuf, diff --git a/kr2r/src/bin/merge_fna.rs b/kr2r/src/bin/merge_fna.rs index f111879..089d81b 100644 --- a/kr2r/src/bin/merge_fna.rs +++ b/kr2r/src/bin/merge_fna.rs @@ -1,12 +1,12 @@ use clap::Parser; +use flate2::read::GzDecoder; use kr2r::utils::{find_files, open_file}; use std::collections::HashMap; use std::fs::{create_dir_all, File, OpenOptions}; use std::io::{BufRead, BufReader, BufWriter, Result, Write}; use 
std::path::PathBuf; - -use flate2::read::GzDecoder; +use std::time::Instant; #[derive(Parser, Debug, Clone)] #[clap(version, about = "A tool for processing genomic files")] @@ -170,6 +170,9 @@ fn merge_fna(assembly_files: &Vec, database: &PathBuf) -> Result<()> { } pub fn run(args: Args) -> Result<()> { + // 开始计时 + let start = Instant::now(); + println!("merge fna start..."); let download_dir = args.download_dir; let database = &args.database; @@ -207,6 +210,9 @@ pub fn run(args: Args) -> Result<()> { merge_fna(&assembly_files, &args.database)?; + // 计算持续时间 + let duration = start.elapsed(); + println!("merge fna took: {:?}", duration); Ok(()) } From 38fb6df4dc4f3a2a3680c91f91b78a14f8f4ad62 Mon Sep 17 00:00:00 2001 From: dagou Date: Sun, 23 Jun 2024 11:34:55 +0800 Subject: [PATCH 16/18] examples --- .gitignore | 2 + kr2r/examples/build_and_classify.rs | 82 +++++++++++++++++++++++++++++ kr2r/src/args.rs | 14 ++--- kr2r/src/bin/annotate.rs | 8 +-- kr2r/src/bin/build_k2_db.rs | 13 ++--- kr2r/src/bin/classify.rs | 12 ++--- kr2r/src/bin/hashshard.rs | 9 ++-- kr2r/src/bin/kun.rs | 25 +++++---- kr2r/src/bin/resolve.rs | 8 +-- kr2r/src/bin/splitr.rs | 8 +-- kr2r/src/utils.rs | 2 +- ncbi/examples/run_download.rs | 56 ++++++++++++++++++++ 12 files changed, 189 insertions(+), 50 deletions(-) create mode 100644 kr2r/examples/build_and_classify.rs create mode 100644 ncbi/examples/run_download.rs diff --git a/.gitignore b/.gitignore index c277cc9..263cc94 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ Cargo.lock lib/ out_dir/ slurm.sh +downloads/ +test_database/ diff --git a/kr2r/examples/build_and_classify.rs b/kr2r/examples/build_and_classify.rs new file mode 100644 index 0000000..b3cbd13 --- /dev/null +++ b/kr2r/examples/build_and_classify.rs @@ -0,0 +1,82 @@ +use std::fs; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + // Define the paths and directories + let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() 
+ .to_path_buf(); + let kr2r_binary = workspace_root.join("target/release/kun_peng"); + let data_dir = workspace_root.join("data"); + let test_dir = workspace_root.join("test_database"); + + // Ensure the necessary directories exist + fs::create_dir_all(&data_dir).expect("Failed to create download directory"); + fs::create_dir_all(&test_dir).expect("Failed to create database directory"); + + // Command 1: ./target/release/kun_peng build --download-dir data/ --db test_database + let build_args = vec![ + "build".to_string(), + "--download-dir".to_string(), + data_dir.to_string_lossy().to_string(), + "--db".to_string(), + test_dir.to_string_lossy().to_string(), + ]; + + let build_command_str = format!("{} {}", kr2r_binary.to_string_lossy(), build_args.join(" ")); + println!("Executing command: {}", build_command_str); + + let build_output = Command::new(&kr2r_binary) + .args(&build_args) + .output() + .expect("Failed to run kun_peng build command"); + println!( + "kun_peng build output: {}", + String::from_utf8_lossy(&build_output.stdout) + ); + if !build_output.stderr.is_empty() { + println!( + "kun_peng build error: {}", + String::from_utf8_lossy(&build_output.stderr) + ); + } + + // Command 2: ./target/release/kun_peng direct --db test_database data/COVID_19.fa + let covid_fa = data_dir.join("COVID_19.fa"); + if !covid_fa.exists() { + println!( + "kun_peng error: fasta file {} does not exists", + covid_fa.to_string_lossy().to_string() + ); + } + let direct_args = vec![ + "direct".to_string(), + "--db".to_string(), + test_dir.to_string_lossy().to_string(), + covid_fa.to_string_lossy().to_string(), + ]; + + let direct_command_str = format!( + "{} {}", + kr2r_binary.to_string_lossy(), + direct_args.join(" ") + ); + println!("Executing command: {}", direct_command_str); + + let direct_output = Command::new(&kr2r_binary) + .args(&direct_args) + .output() + .expect("Failed to run kun_peng direct command"); + println!( + "kun_peng direct output: {}", + 
String::from_utf8_lossy(&direct_output.stdout) + ); + if !direct_output.stderr.is_empty() { + println!( + "kun_peng direct error: {}", + String::from_utf8_lossy(&direct_output.stderr) + ); + } +} diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs index 059dad8..2a55865 100644 --- a/kr2r/src/args.rs +++ b/kr2r/src/args.rs @@ -51,18 +51,14 @@ const BATCH_SIZE: usize = 16 * 1024 * 1024; long_about = "classify a set of sequences" )] pub struct ClassifyArgs { - /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + // /// database hash chunk directory and other files + #[arg(long = "db", required = true)] + pub database: PathBuf, /// chunk directory #[clap(long)] pub chunk_dir: PathBuf, - /// Enables use of a Kraken 2 compatible shared database. - #[clap(long, default_value_t = false)] - pub kraken_db_type: bool, - /// File path for outputting normal Kraken output. #[clap(long = "output-dir", value_parser)] pub kraken_output_dir: Option, @@ -110,6 +106,10 @@ pub struct ClassifyArgs { )] pub minimum_hit_groups: usize, + /// Enables use of a Kraken 2 compatible shared database. + #[clap(long, default_value_t = false)] + pub kraken_db_type: bool, + /// In comb. w/ -R, provide minimizer information in report #[clap(short = 'K', long, value_parser, default_value_t = false)] pub report_kmer_data: bool, diff --git a/kr2r/src/bin/annotate.rs b/kr2r/src/bin/annotate.rs index b685872..5fe5bb4 100644 --- a/kr2r/src/bin/annotate.rs +++ b/kr2r/src/bin/annotate.rs @@ -25,8 +25,8 @@ pub const BATCH_SIZE: usize = 8 * 1024 * 1024; )] pub struct Args { /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + #[arg(long = "db", required = true)] + pub database: PathBuf, /// Enables use of a Kraken 2 compatible shared database. Default is false. 
#[clap(long, default_value_t = false)] @@ -200,7 +200,7 @@ fn process_chunk_file>( let start = Instant::now(); - let config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; let parition = hash_files.len(); let chtm = if args.kraken_db_type { CHTable::from_pair( @@ -231,7 +231,7 @@ fn process_chunk_file>( pub fn run(args: Args) -> Result<()> { let chunk_files = find_and_sort_files(&args.chunk_dir, "sample", ".k2")?; - let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; + let hash_files = find_and_sort_files(&args.database, "hash", ".k2d")?; // 开始计时 let start = Instant::now(); diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs index f0c9d7b..f231bd5 100644 --- a/kr2r/src/bin/build_k2_db.rs +++ b/kr2r/src/bin/build_k2_db.rs @@ -12,16 +12,14 @@ use kr2r::utils::{ }; use kr2r::IndexOptions; use std::fs::remove_file; -use std::path::PathBuf; use std::time::Instant; #[derive(Parser, Debug, Clone)] -#[clap(author, version, about="build `k2d` files", long_about = None)] +#[clap(author, version, about="build database", long_about = None)] pub struct Args { - /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: Option, - + // /// database hash chunk directory and other files + // #[clap(long)] + // pub k2d_dir: Option, #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")] pub hash_capacity: usize, @@ -43,8 +41,7 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box Result<()> { - let options_filename = &args.k2d_dir.join("opts.k2d"); + let 
options_filename = &args.database.join("opts.k2d"); let idx_opts = IndexOptions::read_index_options(options_filename)?; if args.paired_end_processing && !args.single_file_pairs && args.input_files.len() % 2 != 0 { @@ -331,10 +331,10 @@ pub fn run(args: Args) -> Result<()> { )); } - let taxonomy_filename = args.k2d_dir.join("taxo.k2d"); + let taxonomy_filename = args.database.join("taxo.k2d"); let taxo = Taxonomy::from_file(taxonomy_filename)?; - let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; println!("{:?}", hash_config); if hash_config.hash_capacity == 0 { @@ -343,7 +343,7 @@ pub fn run(args: Args) -> Result<()> { println!("classify start..."); let start = Instant::now(); let meros = idx_opts.as_meros(); - let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?; + let hash_files = find_and_sort_files(&args.database, "hash", ".k2d")?; let chtable = CHTable::from_hash_files(hash_config, hash_files)?; process_files(args, meros, hash_config, &chtable, &taxo)?; diff --git a/kr2r/src/bin/hashshard.rs b/kr2r/src/bin/hashshard.rs index e5b5256..64150fe 100644 --- a/kr2r/src/bin/hashshard.rs +++ b/kr2r/src/bin/hashshard.rs @@ -53,10 +53,9 @@ pub struct Args { #[clap(long = "db", value_parser, required = true)] database: PathBuf, - /// database hash chunk directory and other files - #[clap(long)] - k2d_dir: Option, - + // /// database hash chunk directory and other files + // #[clap(long)] + // k2d_dir: Option, /// Specifies the hash file capacity. Acceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K'). /// Note: The specified capacity affects the index size, with a factor of 4 applied. For example, specifying '1G' results in an index size of '4G'. 
/// Default: 1G (capacity 1G = file size 4G) @@ -76,7 +75,7 @@ pub fn run(args: Args) -> IOResult<()> { let file_len = hash_config.capacity * 4 + 32; let b_size = std::mem::size_of::(); - let k2d_dir = args.k2d_dir.unwrap_or(args.database.clone()); + let k2d_dir = args.database.clone(); create_dir_all(&k2d_dir).expect(&format!("create hash dir error {:?}", k2d_dir)); diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs index 37c24d0..65b097f 100644 --- a/kr2r/src/bin/kun.rs +++ b/kr2r/src/bin/kun.rs @@ -17,12 +17,11 @@ use std::path::PathBuf; use std::time::Instant; #[derive(Parser, Debug, Clone)] -#[clap(author, version, about="build `k2d` files", long_about = None)] +#[clap(author, version, about="build database", long_about = None)] struct BuildArgs { - /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: Option, - + // /// database hash chunk directory and other files + // #[clap(long)] + // pub k2d_dir: Option, /// Directory to store downloaded files #[arg(short, long, required = true)] pub download_dir: PathBuf, @@ -60,7 +59,7 @@ struct Args { impl From for splitr::Args { fn from(item: ClassifyArgs) -> Self { Self { - k2d_dir: item.k2d_dir, + database: item.database, paired_end_processing: item.paired_end_processing, single_file_pairs: item.single_file_pairs, minimum_quality_score: item.minimum_quality_score, @@ -74,7 +73,7 @@ impl From for splitr::Args { impl From for annotate::Args { fn from(item: ClassifyArgs) -> Self { Self { - k2d_dir: item.k2d_dir, + database: item.database, chunk_dir: item.chunk_dir, batch_size: item.batch_size, kraken_db_type: item.kraken_db_type, @@ -85,7 +84,7 @@ impl From for annotate::Args { impl From for resolve::Args { fn from(item: ClassifyArgs) -> Self { Self { - k2d_dir: item.k2d_dir, + database: item.database, chunk_dir: item.chunk_dir, batch_size: item.batch_size, confidence_threshold: item.confidence_threshold, @@ -115,7 +114,6 @@ impl From for build_k2_db::Args { fn from(item: BuildArgs) -> 
Self { Self { build: item.build, - k2d_dir: item.k2d_dir, hash_capacity: parse_size("1G").unwrap(), } } @@ -192,10 +190,15 @@ fn main() -> Result<(), Box> { let splitr_args = splitr::Args::from(cmd_args.clone()); let chunk_files = find_and_sort_files(&splitr_args.chunk_dir, "sample", ".k2")?; - if !chunk_files.is_empty() { + let sample_files = find_and_sort_files(&splitr_args.chunk_dir, "sample", ".map")?; + let bin_files = find_and_sort_files(&splitr_args.chunk_dir, "sample", ".map")?; + if !chunk_files.is_empty() || !sample_files.is_empty() || !bin_files.is_empty() { return Err(Box::new(std::io::Error::new( std::io::ErrorKind::Other, - format!("{} must be empty", &splitr_args.chunk_dir.display()), + format!( + "{} `sample` files must be empty", + &splitr_args.chunk_dir.display() + ), ))); } splitr::run(splitr_args)?; diff --git a/kr2r/src/bin/resolve.rs b/kr2r/src/bin/resolve.rs index 9653d21..2a86376 100644 --- a/kr2r/src/bin/resolve.rs +++ b/kr2r/src/bin/resolve.rs @@ -58,8 +58,8 @@ pub fn read_id_to_seq_map>( )] pub struct Args { /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + #[arg(long = "db", required = true)] + pub database: PathBuf, /// chunk directory #[clap(long, value_parser, required = true)] @@ -206,7 +206,7 @@ fn process_batch>( } pub fn run(args: Args) -> Result<()> { - let k2d_dir = &args.k2d_dir; + let k2d_dir = &args.database; let taxonomy_filename = k2d_dir.join("taxo.k2d"); let taxo = Taxonomy::from_file(taxonomy_filename)?; @@ -214,7 +214,7 @@ pub fn run(args: Args) -> Result<()> { let sample_id_files = find_and_sort_files(&args.chunk_dir, "sample_id", ".map")?; let partition = sample_files.len(); - let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; let value_mask = hash_config.value_mask; let mut total_taxon_counts = TaxonCounters::new(); diff --git 
a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs index 89b46cd..cdf2462 100644 --- a/kr2r/src/bin/splitr.rs +++ b/kr2r/src/bin/splitr.rs @@ -23,8 +23,8 @@ use std::time::Instant; )] pub struct Args { /// database hash chunk directory and other files - #[clap(long)] - pub k2d_dir: PathBuf, + #[arg(long = "db", required = true)] + pub database: PathBuf, // /// The file path for the Kraken 2 options. // #[clap(short = 'o', long = "options-filename", value_parser, required = true)] @@ -251,7 +251,7 @@ fn convert(args: Args, meros: Meros, hash_config: HashConfig) -> Result<()> { pub fn run(args: Args) -> Result<()> { // let args = Args::parse(); - let options_filename = &args.k2d_dir.join("opts.k2d"); + let options_filename = &args.database.join("opts.k2d"); let idx_opts = IndexOptions::read_index_options(options_filename)?; if args.paired_end_processing && !args.single_file_pairs && args.input_files.len() % 2 != 0 { @@ -261,7 +261,7 @@ pub fn run(args: Args) -> Result<()> { "Paired-end processing requires an even number of input files.", )); } - let hash_config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?; + let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; println!("hash_config {:?}", hash_config); if hash_config.hash_capacity == 0 { diff --git a/kr2r/src/utils.rs b/kr2r/src/utils.rs index 207bd3c..b7fa650 100644 --- a/kr2r/src/utils.rs +++ b/kr2r/src/utils.rs @@ -263,7 +263,7 @@ pub fn find_and_sort_files( if a_idx as i32 != *num { return Err(io::Error::new( io::ErrorKind::NotFound, - "File numbers are not continuous starting from 0.", + "File numbers are not continuous starting from 1.", )); } } diff --git a/ncbi/examples/run_download.rs b/ncbi/examples/run_download.rs new file mode 100644 index 0000000..cadaf6f --- /dev/null +++ b/ncbi/examples/run_download.rs @@ -0,0 +1,56 @@ +use std::fs; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + let workspace_root = 
PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .to_path_buf(); + + // Run the NCBI binary to download files + let ncbi_binary = workspace_root.join("target/release/ncbi"); + let download_dir = workspace_root.join("downloads"); + // Ensure the download directory exists + fs::create_dir_all(&download_dir).expect("Failed to create download directory"); + + let args = vec![ + "-d".to_string(), + download_dir.to_string_lossy().to_string(), + "gen".to_string(), + "-g".to_string(), + "archaea".to_string(), + ]; + + let command_str = format!("{} {}", ncbi_binary.to_string_lossy(), args.join(" ")); + println!("Executing command: {}", command_str); + + // Run the NCBI binary to download files + let output = Command::new(&ncbi_binary) + .args(&args) + .output() + .expect("Failed to run NCBI binary"); + println!( + "NCBI binary output: {}", + String::from_utf8_lossy(&output.stdout) + ); + + let args = vec![ + "-d".to_string(), + download_dir.to_string_lossy().to_string(), + "tax".to_string(), + ]; + + let command_str = format!("{} {}", ncbi_binary.to_string_lossy(), args.join(" ")); + println!("Executing command: {}", command_str); + + // Run the NCBI binary to download files + let output = Command::new(&ncbi_binary) + .args(&args) + .output() + .expect("Failed to run NCBI binary"); + println!( + "NCBI binary output: {}", + String::from_utf8_lossy(&output.stdout) + ); +} From fe7ca864f1e221d93b9e5201271558cfdf2dbd04 Mon Sep 17 00:00:00 2001 From: dagou Date: Sun, 23 Jun 2024 11:35:10 +0800 Subject: [PATCH 17/18] examples data --- data/COVID_19.fa | 375 +++++++++++++++++ data/FluA_H1N1.fa | 182 +++++++++ data/FluA_H2N2.fa | 180 +++++++++ data/FluA_H3N2.fa | 183 +++++++++ data/FluB.fa | 192 +++++++++ data/HIV_1.fna | 116 ++++++ data/MERS.fa | 378 ++++++++++++++++++ .../library/viral/assembly_summary_refseq.txt | 3 + ..._000864765.1_ViralProj15476_genomic.fna.gz | Bin 0 -> 3052 bytes ...25.1_ViralMultiSegProj15521_genomic.fna.gz | Bin 0 -> 4629 bytes 
...GCF_009858895.2_ASM985889v3_genomic.fna.gz | Bin 0 -> 9591 bytes data/taxonomy/names.dmp | 39 ++ data/taxonomy/nodes.dmp | 39 ++ 13 files changed, 1687 insertions(+) create mode 100644 data/COVID_19.fa create mode 100644 data/FluA_H1N1.fa create mode 100644 data/FluA_H2N2.fa create mode 100644 data/FluA_H3N2.fa create mode 100644 data/FluB.fa create mode 100644 data/HIV_1.fna create mode 100644 data/MERS.fa create mode 100644 data/library/viral/assembly_summary_refseq.txt create mode 100644 data/library/viral/refseq/GCF_000864765.1_ViralProj15476_genomic.fna.gz create mode 100644 data/library/viral/refseq/GCF_000865725.1_ViralMultiSegProj15521_genomic.fna.gz create mode 100644 data/library/viral/refseq/GCF_009858895.2_ASM985889v3_genomic.fna.gz create mode 100644 data/taxonomy/names.dmp create mode 100644 data/taxonomy/nodes.dmp diff --git a/data/COVID_19.fa b/data/COVID_19.fa new file mode 100644 index 0000000..8fa02d5 --- /dev/null +++ b/data/COVID_19.fa @@ -0,0 +1,375 @@ +>kraken:taxid|2697049|NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAA +AATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGG +ACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT +CGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGC +CTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACAT +CTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAA +ACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC +GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAG +AACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGA +TCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACG 
+GAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTA +GCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCG +TGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAAT +TGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA +CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATG +CAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAG +CCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTT +GTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGG +CTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACA +AGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGT +CTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCAT +TCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAA +CAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCT +TGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGA +GACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTT +GTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGA +AGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA +TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTT +TTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCA +CTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAA +TTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCA +TTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGA +AATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTG 
+CACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT +GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGC +CTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACCACTGGGCATTGATTTAGATG +AGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTAC +CCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGA +AGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATT +GGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATT +GTTGAGGTTCAACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACAGTGGTTGTTA +ATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTT +GAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAA +ACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTA +ATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGT +GTAGATACTGTTCGCACAAATGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGA +AATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA +AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGAA +ACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAG +TGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCAAGAGGGTGTTTTAACTGCTG +TGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAATGCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTAT +ATAACCACTTACCCGGGTCAGGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGC +CTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGC +TTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA +TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGC +GTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTATGTAACACATGGCTTAAATT 
+TGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCG +TATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAA +AGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTA +GTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTG +AGGACTATTAAGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGTAAAACATTTT +ATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGG +TACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAA +CAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTT +ATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTA +GGTGATGTTAGAGAAACAATGAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTG +TAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT +TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTT +ATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCA +GTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTTACTTACAAAGTCCTCAGAAT +ACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGTTACACAACAACCATAAAACCAGTTACTTATAAATTGGAT +GGTGTTGTTTGTACAGAAATTGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAAT +TGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTG +ATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT +GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTG +GCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAG +TTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAA +CCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGT +AGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTT 
+ATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCAT +GGTTTAGCTGCTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTATTGCTACAAT +TGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGT +GTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTG +GTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCA +TGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCT +ATACCTTGTAGTGTTTGTCTTAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTC +ATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT +ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATG +TGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTA +TGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTGTTACAAACGTAATAGAGCAA +CAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAGGTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGC +AAACTACACAATTGGAATTGTGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGA +CTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATG +GTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC +AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATC +ATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACTAGATCAGGCATTAGTGTCTG +ATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCA +ATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTAC +TTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCAC +ATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCC +CGTGACCTTGGTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT +ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAGAATAACTTAC 
+CTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATT +GTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACC +TGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTG +ACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTAT +ACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCAC +GATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC +CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCT +TCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACAC +ACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTCTGTTAGAGTGGTAACAACTT +TTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATCAGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTA +CTTAACAATGATTATTACAGATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTAC +ACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACAT +GCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC +CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTT +GACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTTCACACCTTTAGTACCTTTCT +GGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTA +GTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAA +GTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTG +GAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGT +TCTGATGTTCTTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTTGATGACGTAG +TTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCT +AATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCT +TAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAG 
+CTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAAT +GGTTCATGTGGTAGTGTTGGTTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAAC +TGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA +CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGA +TTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACAT +ACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAAAGAATTACTGCAAAATGGTA +TGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGATGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGT +GTTACTTTCCAAAGTGCAGTGAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTT +AGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTA +TTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC +ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATAC +TAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACTAATCCTTATGACAGCAAGAA +CTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAAT +GCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCAT +GTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTA +TAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGA +CTGACTCTTGGTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTAGCCACTGTAC +AGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCT +AAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGGT +TTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGG +CAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAG +CAGGCTGTTGCTAATGGTGATTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGA +CCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG 
+AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTC +AACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGT +TGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGCATCAGCATTGTGGGAAATCC +AACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAGTGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCT +CTTATTGTAACAGCTTTAAGGGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGAT +GTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTA +GGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC +TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAA +AGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCTACAAGCTGGTAATGCAACAG +AAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCT +AGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACC +GGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATC +CTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTT +AAAAACACAGTCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAG +TACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTT +GTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTAC +CAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGA +CGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATT +TTGATGAAGGTAATTGTGACACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAG +GACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT +AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATG +GTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTG +TTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGACTTAACAAAGCCTTACATTAA 
+GTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACAT +ACCACCCAAATTGTGTTAACTGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTG +TTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCA +CTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT +ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCA +CTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTATGACTTTGCTGTGTCTAAGGG +TTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATG +ACTACTATCGTTATAATCTACCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTT +GATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAA +TAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTA +ATGTCATCCCTACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAGGAGCTACTGT +AGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACC +TTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGC +AAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGT +CATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTT +TTAACATTTGTCAAGCTGTCACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTC +CGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC +ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAG +GTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGG +ACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTTAAACAGGGTGATGATTATGT +GTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCCGGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACAC +TTATGATTGAACGGTTCGTGTCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTC +TTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCT 
+TACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG +TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAA +TGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTATGTTTGCAATGCTCCAGGTTG +TGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTC +CATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCA +ATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGC +AGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAG +AATTACATCTTTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACCGAGGTACAAC +AACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGC +CACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAAT +TATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCT +AGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCAT +TAAAATATTTGCCTATAGATAAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTG +AATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA +AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACC +CTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTT +ATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTT +GGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCAGCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGC +ATGATGTTTCATCTGCAATTAACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAA +GCTGTCTTTATTTCACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTC +ATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA +ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTACA +AGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTCTTTAAAGATTGTAGTAAGGT 
+AATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTG +ACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAAT +GGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTG +TCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTAC +CTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACT +TAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAA +TAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGG +CATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACA +AAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAG +CTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCG +GCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA +CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTT +ATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGC +AATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTG +TGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAAC +AATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCA +CTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTA +TCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA +ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGT +GAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAAC +AACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATA +ATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGT +GTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGT 
+TGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCAT +CTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAG +TCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAAC +ATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAA +TCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTC +ATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAG +TTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACA +TTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT +ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATA +CTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCT +GATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAA +TGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTA +TTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGT +GGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTA +TAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG +GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACA +AATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTC +TTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAG +TTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAG +TCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTG +ACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCAT +GCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC +TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTG 
+TTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCAC +AAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCA +GCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTT +ATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTG +GTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGA +TTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA +ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTC +ACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTAC +AAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCA +ACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAA +TTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGG +GCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACA +ATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA +GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACA +ATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATG +CACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTA +ACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTAC +TGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACAC +CAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCA +GATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATT +CTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCT +TACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAA +GACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTT 
+GTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAA +CAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAG +CAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT +GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACA +GATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGC +ATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACC +AAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGA +AAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAAT +TTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCA +GACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT +ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCC +TCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTG +CCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAA +AGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGT +CAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATA +CATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTC +AATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTT +GCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAA +GGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAG +CAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTT +CGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGC +AACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTC +GTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT 
+AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGC +ATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACA +AGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATT +ACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCT +TCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTT +AATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGA +ACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT +TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTA +AAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTA +AATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCT +TAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATG +CCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTG +CTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCT +CAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGT +GGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATC +ACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACA +GGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACA +GATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTT +GGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGAT +GAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA +GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCA +ATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGAC +GGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACT 
+TTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGAT +TGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTT +GGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATC +ATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC +GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGG +ATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAAT +TGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGT +TGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTAC +GTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCC +AAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGA +CAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCT +ATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGC +AATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAG +CAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTA +GGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAG +CTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAA +GAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC +AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCC +AGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGC +CATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACAT +TCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAG +CAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTC +AACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATA 
+GTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT +TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGT +ACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCAT +GTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/data/FluA_H1N1.fa b/data/FluA_H1N1.fa new file mode 100644 index 0000000..119b0d2 --- /dev/null +++ b/data/FluA_H1N1.fa @@ -0,0 +1,182 @@ +>kraken:taxid|211044|NC_002023.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 1, complete sequence +AGCGAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAACTAAGAAATCTAATGTCGCAGTCTCGCACCCGCGA +GATACTCACAAAAACCACCGTGGACCATATGGCCATAATCAAGAAGTACACATCAGGAAGACAGGAGAAGAACCCAGCAC +TTAGGATGAAATGGATGATGGCAATGAAATATCCAATTACAGCAGACAAGAGGATAACGGAAATGATTCCTGAGAGAAAT +GAGCAAGGACAAACTTTATGGAGTAAAATGAATGATGCCGGATCAGACCGAGTGATGGTATCACCTCTGGCTGTGACATG +GTGGAATAGGAATGGACCAATGACAAATACAGTTCATTATCCAAAAATCTACAAAACTTATTTTGAAAGAGTCGAAAGGC +TAAAGCATGGAACCTTTGGCCCTGTCCATTTTAGAAACCAAGTCAAAATACGTCGGAGAGTTGACATAAATCCTGGTCAT +GCAGATCTCAGTGCCAAGGAGGCACAGGATGTAATCATGGAAGTTGTTTTCCCTAACGAAGTGGGAGCCAGGATACTAAC +ATCGGAATCGCAACTAACGATAACCAAAGAGAAGAAAGAAGAACTCCAGGATTGCAAAATTTCTCCTTTGATGGTTGCAT +ACATGTTGGAGAGAGAACTGGTCCGCAAAACGAGATTCCTCCCAGTGGCTGGTGGAACAAGCAGTGTGTACATTGAAGTG +TTGCATTTGACTCAAGGAACATGCTGGGAACAGATGTATACTCCAGGAGGGGAAGTGAAGAATGATGATGTTGATCAAAG +CTTGATTATTGCTGCTAGGAACATAGTGAGAAGAGCTGCAGTATCAGCAGACCCACTAGCATCTTTATTGGAGATGTGCC +ACAGCACACAGATTGGTGGAATTAGGATGGTAGACATCCTTAAGCAGAACCCAACAGAAGAGCAAGCCGTGGGTATATGC +AAGGCTGCAATGGGACTGAGAATTAGCTCATCCTTCAGTTTTGGTGGATTCACATTTAAGAGAACAAGCGGATCATCAGT +CAAGAGAGAGGAAGAGGTGCTTACGGGCAATCTTCAAACATTGAAGATAAGAGTGCATGAGGGATATGAAGAGTTCACAA +TGGTTGGGAGAAGAGCAACAGCCATACTCAGAAAAGCAACCAGGAGATTGATTCAGCTGATAGTGAGTGGGAGAGACGAA +CAGTCGATTGCCGAAGCAATAATTGTGGCCATGGTATTTTCACAAGAGGATTGTATGATAAAAGCAGTTAGAGGTGATCT +GAATTTCGTCAATAGGGCGAATCAGCGACTGAATCCTATGCATCAACTTTTAAGACATTTTCAGAAGGATGCGAAAGTGC 
+TTTTTCAAAATTGGGGAGTTGAACCTATCGACAATGTGATGGGAATGATTGGGATATTGCCCGACATGACTCCAAGCATC +GAGATGTCAATGAGAGGAGTGAGAATCAGCAAAATGGGTGTAGATGAGTACTCCAGCACGGAGAGGGTAGTGGTGAGCAT +TGACCGGTTCTTGAGAGTCCGGGACCAACGAGGAAATGTACTACTGTCTCCCGAGGAGGTCAGTGAAACACAGGGAACAG +AGAAACTGACAATAACTTACTCATCGTCAATGATGTGGGAGATTAATGGTCCTGAATCAGTGTTGGTCAATACCTATCAA +TGGATCATCAGAAACTGGGAAACTGTTAAAATTCAGTGGTCCCAGAACCCTACAATGCTATACAATAAAATGGAATTTGA +ACCATTTCAGTCTTTAGTACCTAAGGCCATTAGAGGCCAATACAGTGGGTTTGTGAGAACTCTGTTCCAACAAATGAGGG +ATGTGCTTGGGACATTTGATACCGCACAGATAATAAAACTTCTTCCCTTCGCAGCCGCTCCACCAAAGCAAAGTAGAATG +CAGTTCTCCTCATTTACTGTGAATGTGAGGGGATCAGGAATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTA +CAACAAGGCCACGAAGAGACTCACAGTTCTCGGAAAGGATGCTGGCACTTTAACCGAAGACCCAGATGAAGGCACAGCTG +GAGTGGAGTCCGCTGTTCTGAGGGGATTCCTCATTCTGGGCAAAGAAGACAGGAGATATGGGCCAGCATTAAGCATCAAT +GAACTGAGCAACCTTGCGAAAGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA +ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAGTGTCGAATAGTTT +AAAAACGACCTTGTTTCTACT +>kraken:taxid|211044|NC_002021.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 2, complete sequence +AGCGAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTAAAAGTGCCAGCACAAAATGCTATAAG +CACAACTTTCCCTTATACCGGAGACCCTCCTTACAGCCATGGGACAGGAACAGGATACACCATGGATACTGTCAACAGGA +CACATCAGTACTCAGAAAAGGCAAGATGGACAACAAACACCGAAACTGGAGCACCGCAACTCAACCCGATTGATGGGCCA +CTGCCAGAAGACAATGAACCAAGTGGTTATGCCCAAACAGATTGTGTATTGGAAGCAATGGCTTTCCTTGAGGAATCCCA +TCCTGGTATTTTTGAAAACTCGTGTATTGAAACGATGGAGGTTGTTCAGCAAACACGAGTAGACAAGCTGACACAAGGCC +GACAGACCTATGACTGGACTTTAAATAGAAACCAGCCTGCTGCAACAGCATTGGCCAACACAATAGAAGTGTTCAGATCA +AATGGCCTCACGGCCAATGAGTCTGGAAGGCTCATAGACTTCCTTAAGGATGTAATGGAGTCAATGAAAAAAGAAGAAAT +GGGGATCACAACTCATTTTCAGAGAAAGAGACGGGTGAGAGACAATATGACTAAGAAAATGATAACACAGAGAACAATAG +GTAAAAGGAAACAGAGATTGAACAAAAGGAGTTATCTAATTAGAGCATTGACCCTGAACACAATGACCAAAGATGCTGAG +AGAGGGAAGCTAAAACGGAGAGCAATTGCAACCCCAGGGATGCAAATAAGGGGGTTTGTATACTTTGTTGAGACACTGGC 
+AAGGAGTATATGTGAGAAACTTGAACAATCAGGGTTGCCAGTTGGAGGCAATGAGAAGAAAGCAAAGTTGGCAAATGTTG +TAAGGAAGATGATGACCAATTCTCAGGACACCGAACTTTCTTTGACCATCACTGGAGATAACACCAAATGGAACGAAAAT +CAGAATCCTCGGATGTTTTTGGCCATGATCACATATATGACCAGAAATCAGCCCGAATGGTTCAGAAATGTTCTAAGTAT +TGCTCCAATAATGTTCTCAAACAAAATGGCGAGACTGGGAAAAGGGTATATGTTTGAGAGCAAGAGTATGAAACTTAGAA +CTCAAATACCTGCAGAAATGCTAGCAAGCATTGATTTGAAATATTTCAATGATTCAACAAGAAAGAAGATTGAAAAAATC +CGACCGCTCTTAATAGAGGGGACTGCATCATTGAGCCCTGGAATGATGATGGGCATGTTCAATATGTTAAGCACTGTATT +AGGCGTCTCCATCCTGAATCTTGGACAAAAGAGATACACCAAGACTACTTACTGGTGGGATGGTCTTCAATCCTCTGACG +ATTTTGCTCTGATTGTGAATGCACCCAATCATGAAGGGATTCAAGCCGGAGTCGACAGGTTTTATCGAACCTGTAAGCTA +CATGGAATCAATATGAGCAAGAAAAAGTCTTACATAAACAGAACAGGTACATTTGAATTCACAAGTTTTTTCTATCGTTA +TGGGTTTGTTGCCAATTTCAGCATGGAGCTTCCCAGTTTTGGTGTGTCTGGGAGCAACGAGTCAGCGGACATGAGTATTG +GAGTTACTGTCATCAAAAACAATATGATAAACAATGATCTTGGTCCAGCAACAGCTCAAATGGCCCTTCAGTTGTTCATC +AAAGATTACAGGTACACGTACCGATGCCATAGAGGTGACACACAAATACAAACCCGAAGATCATTTGAAATAAAGAAACT +GTGGGAGCAAACCCGTTCCAAAGCTGGACTGCTGGTCTCCGACGGAGGCCCAAATTTATACAACATTAGAAATCTCCACA +TTCCTGAAGTCTGCCTAAAATGGGAATTGATGGATGAGGATTACCAGGGGCGTTTATGCAACCCACTGAACCCATTTGTC +AGCCATAAAGAAATTGAATCAATGAACAATGCAGTGATGATGCCAGCACATGGTCCAGCCAAAAACATGGAGTATGATGC +TGTTGCAACAACACACTCCTGGATCCCCAAAAGAAATCGATCCATCTTGAATACAAGTCAAAGAGGAGTACTTGAAGATG +AACAAATGTACCAAAGGTGCTGCAATTTATTTGAAAAATTCTTCCCCAGCAGTTCATACAGAAGACCAGTCGGGATATCC +AGTATGGTGGAGGCTATGGTTTCCAGAGCCCGAATTGATGCACGGATTGATTTCGAATCTGGAAGGATAAAGAAAGAAGA +GTTCACTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATG +AAAAAATGCCTTGTTCCTACT +>kraken:taxid|211044|NC_002022.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 3, complete sequence +AGCGAAAGCAGGTACTGATCCAAAATGGAAGATTTTGTGCGACAATGCTTCAATCCGATGATTGTCGAGCTTGCGGAAAA +AACAATGAAAGAGTATGGGGAGGACCTGAAAATCGAAACAAACAAATTTGCAGCAATATGCACTCACTTGGAAGTATGCT +TCATGTATTCAGATTTCCACTTCATCAATGAGCAAGGCGAGTCAATAATCGTAGAACTTGGTGATCCTAATGCACTTTTG 
+AAGCACAGATTTGAAATAATCGAGGGAAGAGATCGCACAATGGCCTGGACAGTAGTAAACAGTATTTGCAACACTACAGG +GGCTGAGAAACCAAAGTTTCTACCAGATTTGTATGATTACAAGGAAAATAGATTCATCGAAATTGGAGTAACAAGGAGAG +AAGTTCACATATACTATCTGGAAAAGGCCAATAAAATTAAATCTGAGAAAACACACATCCACATTTTCTCGTTCACTGGG +GAAGAAATGGCCACAAAGGCCGACTACACTCTCGATGAAGAAAGCAGGGCTAGGATCAAAACCAGGCTATTCACCATAAG +ACAAGAAATGGCCAGCAGAGGCCTCTGGGATTCCTTTCGTCAGTCCGAGAGAGGAGAAGAGACAATTGAAGAAAGGTTTG +AAATCACAGGAACAATGCGCAAGCTTGCCGACCAAAGTCTCCCGCCGAACTTCTCCAGCCTTGAAAATTTTAGAGCCTAT +GTGGATGGATTCGAACCGAACGGCTACATTGAGGGCAAGCTGTCTCAAATGTCCAAAGAAGTAAATGCTAGAATTGAACC +TTTTTTGAAAACAACACCACGACCACTTAGACTTCCGAATGGGCCTCCCTGTTCTCAGCGGTCCAAATTCCTGCTGATGG +ATGCCTTAAAATTAAGCATTGAGGACCCAAGTCATGAAGGAGAGGGAATACCGCTATATGATGCAATCAAATGCATGAGA +ACATTCTTTGGATGGAAGGAACCCAATGTTGTTAAACCACACGAAAAGGGAATAAATCCAAATTATCTTCTGTCATGGAA +GCAAGTACTGGCAGAACTGCAGGACATTGAGAATGAGGAGAAAATTCCAAAGACTAAAAATATGAAAAAAACAAGTCAGC +TAAAGTGGGCACTTGGTGAGAACATGGCACCAGAAAAGGTAGACTTTGACGACTGTAAAGATGTAGGTGATTTGAAGCAA +TATGATAGTGATGAACCAGAATTGAGGTCGCTTGCAAGTTGGATTCAGAATGAGTTCAACAAGGCATGCGAACTGACAGA +TTCAAGCTGGATAGAGCTTGATGAGATTGGAGAAGATGTGGCTCCAATTGAACACATTGCAAGCATGAGAAGGAATTATT +TCACATCAGAGGTGTCTCACTGCAGAGCCACAGAATACATAATGAAGGGGGTGTACATCAATACTGCCTTACTTAATGCA +TCTTGTGCAGCAATGGATGATTTCCAATTAATTCCAATGATAAGCAAGTGTAGAACTAAGGAGGGAAGGCGAAAGACCAA +CTTGTATGGTTTCATCATAAAAGGAAGATCCCACTTAAGGAATGACACCGACGTGGTAAACTTTGTGAGCATGGAGTTTT +CTCTCACTGACCCAAGACTTGAACCACACAAATGGGAGAAGTACTGTGTTCTTGAGATAGGAGATATGCTTCTAAGAAGT +GCCATAGGCCAGGTTTCAAGGCCCATGTTCTTGTATGTGAGGACAAATGGAACCTCAAAAATTAAAATGAAATGGGGAAT +GGAGATGAGGCGTTGTCTCCTCCAGTCACTTCAACAAATTGAGAGTATGATTGAAGCTGAGTCCTCTGTCAAAGAGAAAG +ACATGACCAAAGAGTTCTTTGAGAACAAATCAGAAACATGGCCCATTGGAGAGTCTCCCAAAGGAGTGGAGGAAAGTTCC +ATTGGGAAGGTCTGCAGGACTTTATTAGCAAAGTCGGTATTTAACAGCTTGTATGCATCTCCACAACTAGAAGGATTTTC +AGCTGAATCAAGAAAACTGCTTCTTATCGTTCAGGCTCTTAGGGACAATCTGGAACCTGGGACCTTTGATCTTGGGGGGC +TATATGAAGCAATTGAGGAGTGCCTAATTAATGATCCCTGGGTTTTGCTTAATGCTTCTTGGTTCAACTCCTTCCTTACA 
+CATGCATTGAGTTAGTTGTGGCAGTGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTTTCTACT +>kraken:taxid|211044|NC_002017.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 4, complete sequence +AGCAAAAGCAGGGGAAAATAAAAACAACCAAAATGAAGGCAAACCTACTGGTCCTGTTATGTGCACTTGCAGCTGCAGAT +GCAGACACAATATGTATAGGCTACCATGCGAACAATTCAACCGACACTGTTGACACAGTGCTCGAGAAGAATGTGACAGT +GACACACTCTGTTAACCTGCTCGAAGACAGCCACAACGGAAAACTATGTAGATTAAAAGGAATAGCCCCACTACAATTGG +GGAAATGTAACATCGCCGGATGGCTCTTGGGAAACCCAGAATGCGACCCACTGCTTCCAGTGAGATCATGGTCCTACATT +GTAGAAACACCAAACTCTGAGAATGGAATATGTTATCCAGGAGATTTCATCGACTATGAGGAGCTGAGGGAGCAATTGAG +CTCAGTGTCATCATTCGAAAGATTCGAAATATTTCCCAAAGAAAGCTCATGGCCCAACCACAACACAACCAAAGGAGTAA +CGGCAGCATGCTCCCATGCGGGGAAAAGCAGTTTTTACAGAAATTTGCTATGGCTGACGGAGAAGGAGGGCTCATACCCA +AAGCTGAAAAATTCTTATGTGAACAAGAAAGGGAAAGAAGTCCTTGTACTGTGGGGTATTCATCACCCGTCTAACAGTAA +GGATCAACAGAATATCTATCAGAATGAAAATGCTTATGTCTCTGTAGTGACTTCAAATTATAACAGGAGATTTACCCCGG +AAATAGCAGAAAGACCCAAAGTAAGAGATCAAGCTGGGAGGATGAACTATTACTGGACCTTGCTAAAACCCGGAGACACA +ATAATATTTGAGGCAAATGGAAATCTAATAGCACCAAGGTATGCTTTCGCACTGAGTAGAGGCTTTGGGTCCGGCATCAT +CACCTCAAACGCATCAATGCATGAGTGTAACACGAAGTGTCAAACACCCCTGGGAGCTATAAACAGCAGTCTCCCTTTCC +AGAATATACACCCAGTCACAATAGGAGAGTGCCCAAAATACGTCAGGAGTGCCAAATTGAGGATGGTTACAGGACTAAGG +AACATTCCGTCCATTCAATCCAGAGGTCTATTTGGAGCCATTGCCGGTTTTATTGAAGGGGGATGGACTGGAATGATAGA +TGGATGGTACGGTTATCATCATCAGAATGAACAGGGATCAGGCTATGCAGCGGATCAAAAAAGCACACAAAATGCCATTA +ACGGGATTACAAACAAGGTGAACTCTGTTATCGAGAAAATGAACATTCAATTCACAGCTGTGGGTAAAGAATTCAACAAA +TTAGAAAAAAGGATGGAAAATTTAAATAAAAAAGTTGATGATGGATTTCTGGACATTTGGACATATAATGCAGAATTGTT +AGTTCTACTGGAAAATGAAAGGACTCTGGATTTCCATGACTCAAATGTGAAGAATCTGTATGAGAAAGTAAAAAGCCAAT +TAAAGAATAATGCCAAAGAAATCGGAAATGGATGTTTTGAGTTCTACCACAAGTGTGACAATGAATGCATGGAAAGTGTA +AGAAATGGGACTTATGATTATCCCAAATATTCAGAAGAGTCAAAGTTGAACAGGGAAAAGGTAGATGGAGTGAAATTGGA +ATCAATGGGGATCTATCAGATTCTGGCGATCTACTCAACTGTCGCCAGTTCACTGGTGCTTTTGGTCTCCCTGGGGGCAA +TCAGTTTCTGGATGTGTTCTAATGGATCTTTGCAGTGCAGAATATGCATCTGAGATTAGAATTTCAGAAATATGAGGAAA 
+AACACCCTTGTTTCTACT +>kraken:taxid|211044|NC_002019.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 5, complete sequence +AGCAAAAGCAGGGTAGATAATCACTCACTGAGTGACATCAAAATCATGGCGTCCCAAGGCACCAAACGGTCTTACGAACA +GATGGAGACTGATGGAGAACGCCAGAATGCCACTGAAATCAGAGCATCCGTCGGAAAAATGATTGGTGGAATTGGACGAT +TCTACATCCAAATGTGCACAGAACTTAAACTCAGTGATTATGAGGGACGGTTGATCCAAAACAGCTTAACAATAGAGAGA +ATGGTGCTCTCTGCTTTTGACGAAAGGAGAAATAAATACCTGGAAGAACATCCCAGTGCGGGGAAGGATCCTAAGAAAAC +TGGAGGACCTATATACAGAAGAGTAAACGGAAAGTGGATGAGAGAACTCATCCTTTATGACAAAGAAGAAATAAGGCGAA +TCTGGCGCCAAGCTAATAATGGTGACGATGCAACGGCTGGTCTGACTCACATGATGATCTGGCATTCCAATTTGAATGAT +GCAACTTATCAGAGGACAAGGGCTCTTGTTCGCACCGGAATGGATCCCAGGATGTGCTCTCTGATGCAAGGTTCAACTCT +CCCTAGGAGGTCTGGAGCCGCAGGTGCTGCAGTCAAAGGAGTTGGAACAATGGTGATGGAATTGGTCAGGATGATCAAAC +GTGGGATCAATGATCGGAACTTCTGGAGGGGTGAGAATGGACGAAAAACAAGAATTGCTTATGAAAGAATGTGCAACATT +CTCAAAGGGAAATTTCAAACTGCTGCACAAAAAGCAATGATGGATCAAGTGAGAGAGAGCCGGGACCCAGGGAATGCTGA +GTTCGAAGATCTCACTTTTCTAGCACGGTCTGCACTCATATTGAGAGGGTCGGTTGCTCACAAGTCCTGCCTGCCTGCCT +GTGTGTATGGACCTGCCGTAGCCAGTGGGTACGACTTTGAAAGAGAGGGATACTCTCTAGTCGGAATAGACCCTTTCAGA +CTGCTTCAAAACAGCCAAGTGTACAGCCTAATCAGACCAAATGAGAATCCAGCACACAAGAGTCAACTGGTGTGGATGGC +ATGCCATTCTGCCGCATTTGAAGATCTAAGAGTATTGAGCTTCATCAAAGGGACGAAGGTGGTCCCAAGAGGGAAGCTTT +CCACTAGAGGAGTTCAAATTGCTTCCAATGAAAATATGGAGACTATGGAATCAAGTACACTTGAACTGAGAAGCAGGTAC +TGGGCCATAAGGACCAGAAGTGGAGGAAACACCAATCAACAGAGGGCATCTGCGGGCCAAATCAGCATACAACCTACGTT +CTCAGTACAGAGAAATCTCCCTTTTGACAGAACAACCGTTATGGCAGCATTCACTGGGAATACAGAGGGGAGAACATCTG +ACATGAGGACCGAAATCATAAGGATGATGGAAAGTGCAAGACCAGAAGATGTGTCTTTCCAGGGGCGGGGAGTCTTCGAG +CTCTCGGACGAAAAGGCAGCGAGCCCGATCGTGCCTTCCTTTGACATGAGTAATGAAGGATCTTATTTCTTCGGAGACAA +TGCAGAGGAGTACGACAATTAAAGAAAAATACCCTTGTTTCTACT +>kraken:taxid|211044|NC_002018.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 6, complete sequence +AGCGAAAGCAGGGGTTTAAAATGAATCCAAATCAGAAAATAATAACCATTGGATCAATCTGTCTGGTAGTCGGACTAATT 
+AGCCTAATATTGCAAATAGGGAATATAATCTCAATATGGATTAGCCATTCAATTCAAACTGGAAGTCAAAACCATACTGG +AATATGCAACCAAAACATCATTACCTATAAAAATAGCACCTGGGTAAAGGACACAACTTCAGTGATATTAACCGGCAATT +CATCTCTTTGTCCCATCCGTGGGTGGGCTATATACAGCAAAGACAATAGCATAAGAATTGGTTCCAAAGGAGACGTTTTT +GTCATAAGAGAGCCCTTTATTTCATGTTCTCACTTGGAATGCAGGACCTTTTTTCTGACCCAAGGTGCCTTACTGAATGA +CAGGCATTCAAATGGGACTGTTAAGGACAGAAGCCCTTATAGGGCCTTAATGAGCTGCCCTGTCGGTGAAGCTCCGTCCC +CGTACAATTCAAGATTTGAATCGGTTGCTTGGTCAGCAAGTGCATGTCATGATGGCATGGGCTGGCTAACAATCGGAATT +TCAGGTCCAGATAATGGAGCAGTGGCTGTATTAAAATACAACGGCATAATAACTGAAACCATAAAAAGTTGGAGGAAGAA +AATATTGAGGACACAAGAGTCTGAATGTGCCTGTGTAAATGGTTCATGTTTTACTATAATGACTGATGGCCCGAGTGATG +GGCTGGCCTCGTACAAAATTTTCAAGATCGAAAAGGGGAAGGTTACTAAATCAATAGAGTTGAATGCACCTAATTCTCAC +TATGAGGAATGTTCCTGTTACCCTGATACCGGCAAAGTGATGTGTGTGTGCAGAGACAATTGGCATGGTTCGAACCGGCC +ATGGGTGTCTTTCGATCAAAACCTGGATTATCAAATAGGATACATCTGCAGTGGGGTTTTCGGTGACAACCCGCGTCCCA +AAGATGGAACAGGCAGCTGTGGTCCAGTGTATGTTGATGGAGCAAACGGAGTAAAGGGATTTTCATATAGGTATGGTAAT +GGTGTTTGGATAGGAAGGACCAAAAGTCACAGTTCCAGACATGGGTTTGAGATGATTTGGGATCCTAATGGATGGACAGA +GACTGATAGTAAGTTCTCTGTGAGGCAAGATGTTGTGGCAATGACTGATTGGTCAGGGTATAGCGGGAGTTTCGTTCAAC +ATCCTGAGCTAACAGGGCTAGACTGTATAAGGCCGTGCTTCTGGGTTGAATTAATCAGGGGACGACCTAAAGAAAAAACA +ATCTGGACTAGTGCGAGCAGCATTTCTTTTTGTGGCGTGAATAGTGATACTGTAGATTGGTCTTGGCCAGACGGTGCTGA +GTTGCCATTCACCATTGACAAGTAGTCTGTTCAAAAAACTCCTTGTTTCTACT +>kraken:taxid|211044|NC_002016.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 7, complete sequence +AGCGAAAGCAGGTAGATATTGAAAGATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCATCCCGTCAGGCC +CCCTCAAAGCCGAGATCGCACAGAGACTTGAAGATGTCTTTGCAGGGAAGAACACCGATCTTGAGGTTCTCATGGAATGG +CTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTAGGATTTGTGTTCACGCTCACCGTGCCCAGTGAGCG +AGGACTGCAGCGTAGACGCTTTGTCCAAAATGCCCTTAATGGGAACGGGGATCCAAATAACATGGACAAAGCAGTTAAAC +TGTATAGGAAGCTCAAGAGGGAGATAACATTCCATGGGGCCAAAGAAATCTCACTCAGTTATTCTGCTGGTGCACTTGCC +AGTTGTATGGGCCTCATATACAACAGGATGGGGGCTGTGACCACTGAAGTGGCATTTGGCCTGGTATGTGCAACCTGTGA 
+ACAGATTGCTGACTCCCAGCATCGGTCTCATAGGCAAATGGTGACAACAACCAACCCACTAATCAGACATGAGAACAGAA +TGGTTTTAGCCAGCACTACAGCTAAGGCTATGGAGCAAATGGCTGGATCGAGTGAGCAAGCAGCAGAGGCCATGGAGGTT +GCTAGTCAGGCTAGGCAAATGGTGCAAGCGATGAGAACCATTGGGACTCATCCTAGCTCCAGTGCTGGTCTGAAAAATGA +TCTTCTTGAAAATTTGCAGGCCTATCAGAAACGAATGGGGGTGCAGATGCAACGGTTCAAGTGATCCTCTCGCTATTGCC +GCAAATATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCATTTACCGTCGCTTTAA +ATACGGACTGAAAGGAGGGCCTTCTACGGAAGGAGTGCCAAAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAGTG +CTGTGGATGCTGACGATGGTCATTTTGTCAGCATAGAGCTGGAGTAAAAAACTACCTTGTTTCTACT +>kraken:taxid|211044|NC_002020.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 8, complete sequence +AGCAAAAGCAGGGTGACAAAGACATAATGGATCCAAACACTGTGTCAAGCTTTCAGGTAGATTGCTTTCTTTGGCATGTC +CGCAAACGAGTTGCAGACCAAGAACTAGGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAAATCCCTAAGAGG +AAGGGGCAGCACTCTTGGTCTGGACATCGAGACAGCCACACGTGCTGGAAAGCAGATAGTGGAGCGGATTCTGAAAGAAG +AATCCGATGAGGCACTTAAAATGACCATGGCCTCTGTACCTGCGTCGCGTTACCTAACCGACATGACTCTTGAGGAAATG +TCAAGGGAATGGTCCATGCTCATACCCAAGCAGAAAGTGGCAGGCCCTCTTTGTATCAGAATGGACCAGGCGATCATGGA +TAAAAACATCATACTGAAAGCGAACTTCAGTGTGATTTTTGACCGGCTGGAGACTCTAATATTGCTAAGGGCTTTCACCG +AAGAGGGAGCAATTGTTGGCGAAATTTCACCATTGCCTTCTCTTCCAGGACATACTGCTGAGGATGTCAAAAATGCAGTT +GGAGTCCTCATCGGAGGACTTGAATGGAATGATAACACAGTTCGAGTCTCTGAAACTCTACAGAGATTCGCTTGGAGAAG +CAGTAATGAGAATGGGAGACCTCCACTCACTCCAAAACAGAAACGAGAAATGGCGGGAACAATTAGGTCAGAAGTTTGAA +GAAATAAGATGGTTGATTGAAGAAGTGAGACACAAACTGAAGGTAACAGAGAATAGTTTTGAGCAAATAACATTTATGCA +AGCCTTACATCTATTGCTTGAAGTGGAGCAAGAGATAAGAACTTTCTCATTTCAGCTTATTTAATAATAAAAAACACCCT +TGTTTCTACT diff --git a/data/FluA_H2N2.fa b/data/FluA_H2N2.fa new file mode 100644 index 0000000..a8856d3 --- /dev/null +++ b/data/FluA_H2N2.fa @@ -0,0 +1,180 @@ +>kraken:taxid|488241|NC_007378.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 1, complete sequence +AGCAAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAACTACGGAATCTGATGTCGCAGTCTCGCACTCGCGA 
+GATACTAACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCAGGGAGACAGGAAAAGAACCCGTCAC +TTAGGATGAAATGGATGATGGCAATGAAATATCCAATTACAGCTGACAAGAGGATAACAGAAATGGTTCCTGAGAGAAAT +GAGCAAGGACAAACTCTATGGAGTAAAATGAGTGATGCCGGGTCAGATCGAGTAATGGTATCACCTTTGGCAGTGACATG +GTGGAATAGAAATGGACCAATGACAAGTACGGTTCATTATCCAAAAATCTACAAGACTTATTTTGAGAAAGTCGAAAGGT +TAAAACATGGAACCTTTGGCCCTGTCCATTTTAGAAACCAAGTCAAAATACGCCGAAGAGTTGACATAAACCCTGGTCAT +GCAGACCTCAGTGCCAAGGAGGCACAAGACGTAATCATGGAAGTTGTTTTCCCCAATGAAGTGGGGGCCAGGATACTAAC +GTCGGAATCACAATTAACAATAACCAAAGAGAAAAAAGAAGAACTCCAAGATTGCAAAATTTCTCCTTTGATGGTTGCAT +ACATGTTAGAGAGAGAACTTGTCCGAAAAACGAGATTTCTCCCAGTTGCTGGTGGAACAAGCAGTGTGTACATTGAAGTG +TTACACTTGACTCAAGGAACATGTTGGGAACAGATGTACACCCCAGGTGGAGAAGTGAGGAATGATGATGTTGATCAAAG +TCTAATTATTGCAGCCAGGAACATAGTGAGAAGAGCAGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCC +ACAGCACACAGATTGGCGGGACAAGGATGGTGGACATTCTTAGGCAGAACCCAACGGAAGAACAAGCTGTGGATATATGC +AAGGCTGCAATGGGACTGAGAATCAGCTCATCCTTCAGTTTTGGCGGGTTCACATTTAAGAGAACAAGCGGGTCATCAAT +CAAGAGAGAGGAAGAAGTGCTTACGGGCAATCTCCAAACATTGAAAATAAGGGTGCATGAGGGGTACGAGGAATTCACAA +TGGTGGGGAAAAGGGCAACAGCTATACTCAGAAAAGCAACCAGGAGATTGGTTCAGCTGATAGTGAGTGGAAGAGACGAA +CAGTCAATAGCCGAAGCAATAATTGTAGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGAGGTGACCT +GAATTTCGTTAATAGGGCAAATCAGCGATTGAATCCCATGCATCAACTTTTAAGACATTTTCAGAAAGATGCAAAAGTGC +TCTTTCAAAATTGGGGAATTGAACATATCGACAATGTAATGGGAATGATTGGAGTATTACCAGACATGACTCCAAGCACA +GAGATGTCAATGAGAGGGATAAGAGTCAGCAAAATGGGCGTGGATGAATACTCCAGCACAGAGAGGGTAGTGGTAAGCAT +TGACCGGTTTTTGAGAGTTCGAGACCAACGAGGAAATGTACTACTATCTCCTGAGGAGGTCAGTGAAACACAGGGGACAG +AGAAACTGACAATAACTTACTCATCGTCAATGATGTGGGAGATTAATGGCCCTGAGTCAGTGTTGGTCAATACCTATCAG +TGGATCATCAGAAACTGGGAAACTGTTAAAATTCAATGGTCTCAGAATCCTACAATGCTATACAATAAAATGGAATTTGA +GCCATTTCAGTCTTTAGTTCCTAAGGCCATTAGAGGCCAATACAGTGGATTTGTTAGGACTCTATTCCAACAAATGAGGG +ATGTACTTGGGACATTTGATACCACCCAGATAATAAAGCTTCTTCCCTTTGCAGCCGCCCCACCAAAGCAAAGTAGAATG +CAGTTCTCTTCATTGACTGTGAATGTGAGGGGATCAGGAATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTA 
+CAACAAGACCACTAAGAGACTAACAATTCTCGGAAAGGATGCTGGCACTTTAACTGAAGACCCAGATGAAGGCACATCCG +GAGTGGAGTCCGCTGTTCTGAGAGGATTCCTCATTCTGGGCAAGGAAGATAGAAGATATGGACCAGCATTAAGCATCAAT +GAACTGAGTACCCTTGCAAAAGGAGAAAAGGCTAATGTACTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA +ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAATGTTGAATAGTTT +AAAAACGACCTTGTTTCTACT +>kraken:taxid|488241|NC_007375.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 2, complete sequence +AGCAAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAG +TACTACATTCCCTTATACTGGAGATCCTCCATACAGCCATGGGACAGGAACAGGATACACCATGGACACAGTCAACAGAA +CACATCAATATTCAGAAAAGGGGAAGTGGACAACAAACACGGAAACTGGAGCGCCCCAACTTAACCCAATTGATGGACCA +CTACCTGAGGACAATGAACCAAGTGGATATGCACAAACAGACTGCGTCCTGGAAGCAATGGCTTTCCTTGAGGAATCACA +CCCAGGAATCTTTGAAAATTCGTGTCTTGAAACGATGGAAGTTATTCAACAAACAAGAGTGGACAAACTGACCCAAGGTC +GTCAGACCTATGACTGGACATTGAACAGAAATCAGCCGGCTGCAACTGCGCTAGCCAACACTATAGAGGTCTTCAGATCG +AATGGACTGACAGCTAATGAGTCGGGAAGGCTAATAGATTTCCTCAAGGATGTGATAGAATCAATGGATAAAGAGGAGAT +GGAAATAACAACACACTTCCAAAGAAAAAGAAGAGTAAGAGACAACATGACCAAGAAAATGGTCACACAACGAACAATAG +GAAAGAAGAAGCAAAGATTGAACAAGAGAAGCTATCTGATAAGAGCACTGACATTGAACACAATGACTAAAGATGCAGAG +AGAGGTAAATTAAAAAGAAGAGCAATTGCAACACCCGGTATGCAGATCAGAGGGTTCGTGCACTTTGTCGAAACACTAGC +GAGAAATATTTGTGAGAAACTTGAACAGTCTGGGCTTCCGGTTGGAGGTAATGAAAAGAAGGCTAAACTAGCAAATGTTG +TTAGAAAAATGATGACTAATTCACAAGACACAGAGCTCTCTTTCACAATTACTGGAGACAACACCAAATGGAATGAGAAT +CAAAATCCTCGAGTGTTTCTGGCGATGATAACATACATCACAAGAAATCAACCTGAATGGTTTAGAAACGTCCTGAGCAT +TGCACCCATAATGTTCTCAAATAAAATGGCTAGACTAGGGAAAGGTTACATGTTCGAAAGCAAGAGCATGAAGCTCCGAA +CACAAATACCAGCAGAAATGCTAGCAAGTATTGACCTGAAATACTTTAATGAATCAACCAGAAAGAAAATTGAGAAAATA +AGGCCTCTCCTAATAGATGGCACAGTCTCATTGAGTCCTGGAATGATGATGGGCATGTTCAACATGCTAAGTACAGTCTT +AGGAGTCTCAATCCTGAATCTCGGGCAAAAGAAATACACCAAAACNACATACTGGTGGGACGGACTCCAATCCTCTGATG +ACTTCGCTCTCATAGTGAATGCACCAAATCATGAGGGAATACAAGCAGGGGTGAATAGATTCTACAGAACCTGCAAGCTA 
+GTCGGAATCAATATGAGCAAAAAGAAGTCCTACATAAATAGGACAGGGACATTTGAATTCACAAGCTTTTTCTATCGCTA +TGGATTTGTAGCCAATTTTAGCATGGAGCTGCCCAGCTTTGGAGTGTCTGGAATTAATGAATCGGCTGATATGAGCATTG +GGGTAACAGTGATAAAGAACAATATGATAAATAATGACCTTGGGCCAGCAACAGCCCAAATGGCTCTTCAACTATTCATC +AAAGACTACAGATACACGTACCGGTGCCACAGAGGGGACACACAAATTCAGACAAGGAGATCATTCGAGCTAAAGAAGCT +GTGGGAGCAAACCCGCTCAAAGGCAGGACTTTTGGTGTCGGATGGAGGATCAAACTTATACAATATCCGGAATCTCCACA +TTCCAGAAGTCTGCTTGAAATGGGAGCTAATGGATGAAGACTATCAGGGGAGGCTTTGTAATCCCCTGAATCCATTTGTC +AGTCATAAGGAAATTGAGTCTGTAAACAATGCTGTGGTAATGCCAGCTCACGGTCCAGCCAAGAGCATGGAATATGATGC +TGTTGCTACTACACACTCCTGGACCCCTAAGAGGAACCGCTCCATTCTCAACACAAGCCAAAGGGGAATTCTTGAAGATG +AACAGATGTATCAGAAGTGTTGCAATCTATTTGAGAAATTCTTCCCTAGCAGTTCGTACAGGAGACCAGTTGGAATTTCC +AGCATGGTGGAGGCCATGGTGTCTAGGGCTCGGATTGATGCACGGATTGACTTCGAGTCTGGACGGATTAAGAAAGAGGA +GTTCGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATG +AAAAAATGCCTTGTTTCTACT +>kraken:taxid|488241|NC_007376.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 3, complete sequence +AGCAAAAGCAGGTACTGATTCGAAATGGAAGATTTTGTGCGACAATGCTTCAATCCGATGATTGTCGAACTTGCGGAAAA +GGCAATGAAAGAGTATGGAGAAGATCTGAAAATCGAAACAAACAAATTTGCAGCAATATGCACTCACTTGGAAGTATGCT +TCATGTATTCAGATTTTCATTTCATCAATGAGCAAGGCGAGTCAATAATGGTAGAGCTTGATGATCCAAATGCACTTTTG +AAGCACAGATTTGAAATAATAGAGGGAAGAGATCGCACAATGGCCTGGACAGTAGTAAACAGTATTTGCAACACCACAGG +AGCTGAGAAACCGAAGTTTCTGCCAGATTTGTATGATTACAAGGAGAATAGATTCATCGAGATTGGAGTGACAAGGAGAG +AAGTCCACATATACTATCTTGAAAAGGCCAATAAAATTAAATCTGAGAATACACACATCCACATTTTCTCATTCACTGGG +GAAGAAATGGCCACAAAGGCCGACTACACTCTCGATGAGGAAAGCAGGGCTAGGATCAAAACCAGACTATTCACCATAAG +ACAAGAAATGGCCAACAGAGGCCTCTGGGATTCCTTTCGTCAGTCCGAAAGAGGCGAAGAAACAATTGAAGAAAGATTTG +AAATCACAGGGACAATGCGCAGGCTTGCCGACCAAAGTCTCCCGCCGAACTTCTCCTGCCTTGAGAATTTTAGAGCCTAT +GTGGATGGATTCGAACCGAACGGCTACATTGAGGGCAAGCTTTCTCAAATGTCCAAAGAAGTAAATGCAAAAATTGAACC +TTTTCTGAAAACAACACCAAGACCAATTAGACTTCCGGATGGGCCTCCTTGTTTTCAGCGGTCCAAATTCCTGCTGATGG 
+ATGCTTTAAAATTAAGCATTGAGGACCCAAGTCACGAAGGGGAGGGAATACCACTATATGATGCGATCAAGTGCATGAGA +ACATTCTTTGGATGGAAAGAACCCTATATTGTTAAACCACACGAAAAGGGAATAAATCCAAATTATCTGCTGTCATGGAA +GCAAGTACTGGCGGAACTGCAGGACATTGAGAATGAGGAGAAGATTCCAAGAACTAAAAACATGAAGAAAACGAGTCAGC +TAAAGTGGGCACTTGGTGAGAACATGGCACCAGAGAAGGTAGACTTTGACAACTGTAGAGACATAAGCGATTTGAAGCAA +TATGATAGTGACGAACCTGAATTAAGGTCACTTTCAAGCTGGATCCAGAATGAGTTCAACAAGGCATGCGAGCTGACCGA +TTCAATCTGGATAGAGCTCGATGAGATTGGAGAAGACGTGGCTCCAATTGAACACATTGCAAGCATGAGAAGGAATTACT +TCACAGCAGAGGTGTCCCATTGCAGAGCCACAGAATATATAATGAAGGGGGTATACATTAATACTGCCTTGCTTAATGCA +TCCTGTGCAGCAATGGACGATTTCCAACTAATTCCCATGATAAGCAAGTGTAGAACTAAAGAGGGAAGGCGAAAGACCAA +TTTATATGGTTTCATCATAAAAGGAAGATCTCACTTAAGGAATGACACCGACGTGGTAAACTTTGTGAGCATGGAGTTTT +CTCTCACTGACCCGAGACTTGAGCCACACAAATGGGAGAAGTACTGTGTCCTTGAGATAGGAGATATGCTACTAAGAAGT +GCCATAGGCCAGATGTCAAGGCCTATGTTCTTGTATGTGAGGACAAATGGAACATCAAAGATTAAAATGAAATGGGGAAT +GGAGATGAGGCCTTGCCTCCTTCAGTCACTACAACAAATCGAGAGTATGGTTGAAGCCGAGTCCTCTGTCAAAGAGAAAG +ACATGACCAAAGAGTTTTTTGAGAATAAATCAGAAACATGGCCCATTGGGGAGTCCCCCAAAGGAGTGGAAGAAGGTTCC +ATTGGGAAGGTCTGCAGGACTTTATTAGCCAAGTCGGTATTCAATAGCCTGTATGCATCCCCACAATTAGAAGGATTTTC +AGCTGAATCAAGAAAACTGCTTCTTGTCGTTCAGGCTCTTAGGGACAATCTTGAACCTGGAACCTTTGATCTTGGGGGGC +TATATGAAGCAATTGAGGAGTGCCTGATTAATGATCCCTGGGTTTTGCTTAATGCGTCTTGGTTCAACTCCTTCCTAACA +CATGCATTAAGATAGTTGTGGCAATGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTTTCTACT +>kraken:taxid|488241|NC_007374.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 4, complete sequence +AGCAAAAGCAGGGGTTATACCATAGACAACCAAAAGCATAACAATGGCCATCATTTATCTCATACTCCTGTTCACAGCAG +TGAGGGGGGACCAGATATGCATTGGATACCATGCCAATAATTCCACAGAAAAGGTCGACACAATTCTAGAGCGGAATGTC +ACTGTGACTCATGCCAAGGACATCCTTGAGAAGACCCATAACGGAAAGCTATGCAAACTAAACGGAATCCCTCCACTTGA +ACTAGGGGACTGTAGCATTGCCGGATGGCTCCTTGGAAATCCAGAATGTGATAGGCTTCTAAGTGTGCCAGAATGGTCCT +ATATAATGGAGAAAGAAAACCCGAGATACAGTTTGTGTTACCCAGGCAGCTTCAATGACTATGAAGAATTGAAACATCTC +CTCAGCAGCGTGAAACATTTTGAGAAAGTTAAGATTTTGCCCAAAGATAGATGGACACAGCATACAACAACTGGAGGTTC 
+ATGGGCCTGCGCGGTGTCAGGTAAACCATCATTCTTCAGGAACATGGTCTGGCTGACACGTAAAGGATCAAATTATCCGG +TTGCCAAAGGATCGTACAACAATACAAGCGGAGAACAAATGCTAATAATTTGGGGAGTGCACCATCCTAATGATGAGGCA +GAACAAAGAGCATTGTACCAGAATGTGGGAACCTATGTTTCCGTAGCCACATCAACATTGTACAAAAGGTCAATCCCAGA +AATAGCAGCAAGGCCTAAAGTGAATGGACTAGGACGTAGAATGGAATTCTCTTGGACCCTCTTGGATATGTGGGACACCA +TAAATTTTGAGAGCACTGGTAATCTAGTTGCACCAGAGTATGGGTTCAAAATATCGAAAAGAGGTAGTTCAGGGATCATG +AAGACAGAAGGAACACTTGAGAACTGTGAAACCAAATGCCAAACTCCTTTGGGAGCAATAAATACAACACTACCTTTTCA +CAATGTCCACCCACTGACAATAGGTGAATGCCCCAAATATGTAAAATCGGAGAAATTGGTCTTAGCAACAGGACTAAGGA +ATGTTCCCCAGATTGAATCAAGAGGATTGTTTGGGGCAATAGCTGGTTTTATAGAAGGAGGATGGCAAGGAATGGTTGAT +GGTTGGTATGGATACCATCACAGCAATGACCAGGGATCAGGGTATGCAGCAGACAAAGAATCCACTCAAAAGGCATTTAA +TGGAATCACCAACAAGGTAAATTCTGTGATTGAAAAGATGAACACCCAATTTGAAGCTGTTGGGAAAGAATTCAGTAACT +TAGAGAAAAGACTGGAGAACTTGAACAAAAAGATGGAAGACGGGTTTCTAGATGTGTGGACATACAATGCAGAGCTTCTA +GTTCTGATGGAAAATGAGAGGACACTTGACTTTCATGATTCTAATGTCAAGAATCTGTATGATAAAGTCAGAATGCAGCT +GAGAGACAACGTCAAAGAACTAGGAAATGGATGTTTTGAATTTTATCACAAATGTGACAATGAATGCATGGATAGTGTGA +AAAACGGGACATATGATTATCCCAAGTATGAAGAAGAATCTAAACTAAATAGAAATGAAATCAAAGGGGTAAAATTGAGC +AGCATGGGGGTTTATCAAATCCTTGCCATTTATGCTACAGTAGCAGGTTCTCTGTCACTGGCAATCATGATGGCTGGGAT +CTCTTTCTGGATGTGCTCCAACGGGTCTCTGCAGTGCAGAATCTGCATATGATTGTAAGTCATTTTATAATTAAAAACAC +CCTTGTTTCCTGA +>kraken:taxid|488241|NC_007381.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 5, complete sequence +ATGGCGTCCCAAGGCACCAAACGGTCTTATGAACAGATGGAAACTGATGGGGAACGCCAGAATGCAACTGAGATCAGAGC +ATCCGTCGGGAAGATGATTGATGGAATTGGACGATTCTACATCCAAATGTGCACCGAACTTAAACTCAGTGATTATGAGG +GGCGACTGATCCAGAACAGCTTAACAATAGAGAGAATGGTGCTCTCTGCTTTTGACGAGAGAAGGAATAAATATCTGGAA +GAACATCCCAGCGCGGGGAAGGATCCTAAGAAAACTGGAGGACCCATATACAAGAGAGTAGATGGAAAGTGGATGAGGGA +ACTCGTCCTTTATGACAAAGAAGAAATAAGGCGAATCTGGCGCCAAGCCAATAATGGTGATGATGCAACAGCTGGGCTGA +CTCACATGATGATCTGGCATTCCAATTTGAATGATACAACATACCAGAGGACAAGAGCTCTTGTTCGCACCGGAATGGAT 
+CCCAGGATGTGCTCTTTGATGCAGGGTTCGACTCTCCCTAGGAGGTCTGGAGCTGCAGGCGCTGCAGTCAAAGGAGTTGG +GACAATGGTGATGGAGTTGATCAGGATGATCAAACGTGGGATCAATGATCGGAACTTCTGGAGAGGTGAGAATGGACGGA +AAACAAGGAGTGCTTACGAGAGAATGTGCAACATTCTCAAAGGAAAATTTCAAACAGCTGCACAAAGAGCAATGATGGAT +CAAGTGAGAGAAAGCCGGAACCCAGGAAATGCTGAGATCGAAGATCTAATCTTTCTGGCACGGTCTGCACTCATATTGAG +AGGGTCAGTTGCTCACAAATCTTGTCTGCCCGCCTGTGTGTATGGACCTGCCATAGCCAGTGGGTACAACTTCGAAAAAG +AGGGATACTCTCTAGTGGGAATAGACCCTTTCAAACTGCTTCAAAACAGCCAAGTATACAGCCTAATCAGACCGAACGAG +AATCCAGCACACAAGAGTCAGCTGGTGTGGATGGCATGCAATTCTGCTGCATTTGAAGATCTAAGAGTATTAAGCTTCAT +CAGAGGGACCAAAGTATCCCCAAGGGGGAAACTTTCCACTAGAGGAGTACAAATTGCTTCAAATGAAAACATGGATACTA +TGGAATCAAGTACTCTTGAACTAAGAAGCAGGTACTGGGCCATAAGGACCAGAAGTGGAGGAAACACTAATCAACAGAGG +GCCTCTGCAGGTCAAATCAGTGTACAACCTGCATTTTCTGTGCAAAGAAACCTCCCATTTGACAAACCAACCATCATGGC +AGCATTCACTGGGAATACAGAGGGAAGAACATCAGACATGAGGGCAGAAATCATAAGGATGATGGAAGGTGCAAAACCAG +AAGAAATGTCCTTCCAGGGGCGGGGAGTCTTCGAGCTCTCGGACGAAAAGGCAACGAACCCGATCGTGCCCTCTTTTGAC +ATGAGTAATGAAGGATCTTATTTCTTCGGAGACAATGCAGAGGAGTACGACAATTAA +>kraken:taxid|488241|NC_007382.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 6, complete sequence +ATGAATCCAAATCAAAAGATAATAACAATTGGCTCTGTCTCTCTCACCATTGCAACAGTATGCTTCCTCATGCAGATTGC +CATCCTGGTAACTACTGTGACATTGCATTTTAAGCAACATGAGTGCGACTCCCCCGCGAGCAACCAAGTAATGCCGTGTG +AACCAATAATAATAGAAAGGAACATAACAGAGATAGTGTATTTGAATAACACCACCATAGAGAAAGAGATCTGCCCCGAA +GTAGTGGAATACAGAAATTGGTCAAAGCCGCAATGTCAAATTACAGGATTTGCACCTTTTTCTAAGGACAATTCAATCCG +GCTTTCTGCTGGTGGGGACATTTGGGTGACGAGAGAACCTTATGTGTCATGCGATCCTGGCAAGTGTTATCAATTTGCAC +TCGGGCAGGGGACCACACTAGACAACAAACATTCAAATGACACAATACATGATAGAATCCCTCATCGAACCCTATTAATG +AATGAGTTGGGTGTTCCATTTCATTTAGGAACCAGGCAAGTGTGTGTAGCATGGTCCAGCTCAAGTTGTCACGATGGAAA +AGCATGGTTGCATGTTTGTGTCACTGGGGATGATAAAAATGCAACTGCTAGCTTCATTTATGACGGGAGGCTTATGGACA +GTATTGGTTCATGGTCTCAAAATATCCTCAGGACCCAGGAGTCGGAATGCGTTTGTATCAATGGGACTTGCACAGTAGTA +ATGACTGATGGAAGTGCTTCAGGAAGAGCCGATACTAGAATACTATTCATTGAAGAGGGGAAAATTGTCCATATTAGCCC 
+ATTGTCAGGAAGTGCTCAGCATGTAGAGGAGTGTTCCTGTTATCCTCGATATCCTGACGTCAGATGTATCTGCAGAGACA +ACTGGAAAGGCTCTAATAGGCCCGTCATAGACATAAATATGGAAGATTATAGCATTGATTCCAGTTATGTGTGCTCAGGG +CTTGTTGGCGACACACCCAGAAACGACGACAGATCTAGCAATAGTAATTGCAGGAATCCTAACAATGAGAGAGGGAATCC +AGGAGTGAAAGGCTGGGCCTTTGACAATGGAGATGACGTGTGGATGGGAAGAACGATCAGCAAGGATTTACGCTCAGGTT +ATGAAACTTTCAAAGTCATTGGTGGTTGGTCCACACCTAATTCCAAATCGCAGATCAATAGACAGGTCATAGTTGACAGC +AATAATTGGTCAGGTTACTCTGGTATTTTCTCTGTTGAGGGCAAAAGATGCATCAATAGGTGCTTTTATGTGGAGTTGAT +AAGGGGAAGGCAACAGGAGACTAGAGTATGGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACTTCAGGTACTTATG +GAACAGGCTCATGGCCTGATGGGGCGAACATCAATTTCATGCCTATATAA +>kraken:taxid|488241|NC_007377.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 7, complete sequence +AGCAAAAGCAGGTAGATATTGAAAGATGAGCCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCGTCCCGTCAGGCC +CCCTCAAAGCCGAGATCGCACAGAGACTTGAAGATGTCTTTGCTGGGAAGAACACAGATCTTGAGGCTCTCATGGAATGG +CTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTGGGATTTGTATTCACGCTCACCGTGCCAAGTGAGCG +AGGACTGCAGCGTAGACGCTTTGTCCAAAATGCCCTCAATGGGAATGGGGATCCAAATAACATGGACAGAGCAGTTAAAC +TGTATAGAAAGCTTAAGAGGGAGATAACATTCCATGGGGCCAAAGAAGTAGCGCTCAGTTATTCTGCTGGTGCACTTGCC +AGTTGCATGGGCCTCATATACAACAGGATGGGGGCTGTGACCACTGAAGTGGCCTTTGCCGTGGTATGTGCAACCTGTGA +ACAGATTGCTGACTCCCAGCATAGGTCTCACAGGCAAATGGTGACAACAACCAATCCACTAATAAGACATGAGAACAGAA +TGGTTCTGGCCAGCACTACAGCTAAGGCTATGGAGCAAATGGCTGGATCGAGTGAGCAAGCAGCAGAGGCCATGGAGGTT +GCTAGTCAGGCCAGGCAAATGGTGCAGGCAATGAGAGCCATTGGGACTCCTCCTAGCTCCAGTGCTGGTCTAAAAGATGA +TCTTCTTGAAAATTTGCAGGCCTATCAGAAACGAATGGGGGTGCAGATGCAACGATTCAAGTGACCCCCTTGTTGTTGCT +GCGAGTATCATTGGGATCTTGCACTTTATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCATTTATCGCTTCTTTAA +ACACGGTCTGAAAAGAGGGCCTTCTACGGAAGGAGTACCTGAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAGTG +CTGTGGATGCTGACGATAGTCATTTTGTCAGCATAGAGCTGGAGTAAAAAACTACCTTGTTTCTACT +>kraken:taxid|488241|NC_007380.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 8, complete sequence +ATGGATTCTAACACTGTGTCAAGTTTTCAGGTAGATTGCTTCCTTTGGCATGTCCGAAAACAAGTTGTAGACCAAGAACT 
+AGGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAAGTCCCTAAGGGGAAGAGGCAGCACTCTCGATCTAGACA +TCGAAGCAGCCACCCGTGTTGGAAAGCAGATAGTAGAGAGGATTCTGAAGGAAGAATCCGATGAGGCACTTAAAATGACC +ATGGCCTCCGCACCTGCTTCGCGATACCTAACTGACATGACTATTGAGGAATTGTCAAGGGACTGGTTCATGCTAATGCC +CAAGCAGAAAGTGGAAGGCCCTCTTTGCATCAGAATAGACCAGGCAATCATGGATAAGAACATCATGTTGAAAGCGAATT +TCAGTGTGATTTTTGACCGGCTAGAGACCCTAATATTACTAAGGGCTTTCACCGAAGAGGGAGCAATTGTTGGCGAAATT +TCACCATTGCCTTCTCTTCCAGGACATACTATTGAGGATGTCAAAAATGCAATTGGGGTCCTCATCGGAGGACTTGAATG +GAATGATAACACAGTTCGAGTCTCTAAAACTCTACAGAGATTCGCTTGGAGAAGCAGTAATGAGAATGGGAGACCTCCAC +TCACTCCAAAACAGAAACGGAAAATGGCGAGAACAATTAGGTCAAAAGTTCGAAGAGATAAGATGGCTGATTGAAGAAGT +GAGACACAGATTGAAGATAACAGAGAATAGTTTTGAGCAAATAACATTTATGCAAGCCTTACAGCTACTATTTGAAGTGG +AACAAGAGATAAGAACTTTCTCGTTTCAGCTTATTTAA diff --git a/data/FluA_H3N2.fa b/data/FluA_H3N2.fa new file mode 100644 index 0000000..092391b --- /dev/null +++ b/data/FluA_H3N2.fa @@ -0,0 +1,183 @@ +>kraken:taxid|335341|NC_007373.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 1, complete sequence +AGCAAAAGCAGGTCAATTATATTCAGTATGGAAAGAATAAAAGAACTACGGAACCTGATGTCGCAGTCTCGCACTCGCGA +GATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCGGGGAGACAGGAAAAGAACCCGTCAC +TTAGGATGAAATGGATGATGGCAATGAAATACCCAATCACTGCTGACAAAAGGATAACAGAAATGGTTCCGGAGAGAAAT +GAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGATCAGATCGAGTGATGGTATCACCTTTGGCTGTAACATG +GTGGAATAGAAATGGACCCGTGGCAAGTACGGTCCATTACCCAAAAGTATACAAGACTTATTTTGACAAAGTCGAAAGGT +TAAAACATGGAACCTTTGGCCCTGTTCATTTTAGAAATCAAGTCAAGATACGCAGAAGAGTAGACATAAACCCTGGTCAT +GCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGGATACTAAC +ATCAGAATCGCAATTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTTGCAT +ACATGTTAGAGAGAGAACTTGTCCGAAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATATACATTGAAGTC +TTACATTTGACTCAAGGAACGTGTTGGGAACAAATGTACACTCCAGGTGGAGAAGTGAGGAATGACGATGTTGACCAAAG +CCTAATTATTGCGGCCAGGAACATAGTAAGAAGAGCTGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCC 
+ACAGCACACAAATTGGCGGGACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCTGTGGATATATGC +AAGGCTGCAATGGGATTGAGAATCAGCTCATCCTTCAGCTTTGGTGGGTTTACATTTAAAAGAACAAGCGGGTCATCAGT +CAAAAAAGAGGAAGAAGTGCTTACAGGCAATCTCCAAACATTGAAGATAAGAGTACATGAGGGGTATGAGGAGTTCACAA +TGGTGGGGAAAAGAGCAACAGCTATACTCAGAAAAGCAACCAGAAGATTGGTTCAGCTCATAGTGAGTGGAAGAGACGAA +CAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAGGATTGCATGATAAAAGCAGTTAGAGGTGACCT +GAATTTCGTCAACAGAGCAAATCAACGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCGAAAGTGC +TTTTTCAAAATTGGGGAATTGAACACATCGACAGTGTGATGGGAATGGTTGGAGTATTACCAGATATGACTCCAAGCACA +GAGATGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCAT +TGATCGGTTTTTGAGAGTTCGAGACCAACGCGGGAATGTATTATTGTCTCCTGAGGAGGTCAGTGAAACACAGGGAACTG +AAAGATTGACAATAACATATTCATCGTCGATGATGTGGGAGATTAACGGTCCTGAGTCGGTTTTGGTCAATACCTATCAA +TGGATCATCAGAAATTGGGAAGCTGTCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGA +ACCATTTCAATCTTTAGTCCCCAAGGCCATTAGAAGCCAATACAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAG +ACGTACTTGGGACATTTGACACCACCCAGATAATAAAGCTTCTCCCTTTTGCAGCCGCTCCACCAAAGCAAAGCAGAATG +CAGTTCTCTTCACTGACTGTAAATGTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTA +CAACAAGACCACTAAAAGACTAACAATTCTCGGAAAAGATGCCGGCACTTTAATTGAAGACCCAGATGAAAGCACATCCG +GAGTGGAGTCCGCCGTCTTGAGAGGGTTTCTCATTATAGGTAAGGAAGACAGAAGATACGGACCAGCATTAAGCATCAAT +GAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATCGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA +ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAATGTTGAATAGTTT +AAAAACGACCTTGTTTCTACT +>kraken:taxid|335341|NC_007372.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 2, complete sequence +AGCAAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACTCTACTGTTCCTAAAGGTTCCAGCGCAAAATGCCATAAG +CACCACATTCCCTTATACTGGAGATCCTCCATACAGCCATGGAACAGGAACAGGATACACCATGGACACAGTCAACAGAA +CACACCAATATTCAGAGAAGGGGAAGTGGACGACAAATACAGAAACTGGGGCACCCCAACTCAACCCAATTGATGGACCA +CTACCTGAGGATAATGAGCCAAGTGGATATGCACAAACAGACTGTGTCCTGGAGGCTATGGCCTTCCTTGAAGAATCCCA 
+CCCAGGTATCTTTGAGAACTCATGCCTTGAAACAATGGAAGTCGTTCAACAAACAAGGGTGGACAAACTAACCCAAGGCC +GCCAGACTTATGATTGGACATTAAACAGAAATCAACCGGCAGCAACTGCATTAGCCAACACCATAGAAGTTTTTAGATCG +AATGGACTAACAGCCAATGAATCAGGAAGGCTAATAGATTTCCTCAAGGATGTGATGGAATCAATGGATAAAGAGGAAAT +GGAGATAACAACACACTTTCAAAGAAAAAGGAGAGTAAGAGACAACATGACCAAGAAAATGGTCACACAAAGAACAATAG +GGAAGAAAAAACAAAGAGTGAATAAGAGAGGCTATCTAATAAGAGCTTTGACATTGAACACGATGACCAAAGATGCAGAG +AGAGGTAAATTAAAAAGAAGGGCTATTGCAACACCCGGGATGCAAATTAGAGGGTTCGTGTACTTCGTTGAAACTTTAGC +TAGAAGCATTTGCGAAAAGCTTGAACAGTCTGGACTTCCGGTTGGGGGTAATGAAAAGAAGGCCAAACTGGCAAATGTTG +TGAGAAAAATGATGACTAATTCACAAGACACTGAGCTTTCTTTCACAATCACTGGGGACAACACTAAGTGGAATGAAAAT +CAAAACCCTCGAATGTTTTTGGCGATGATTACATATATCACAAAAAATCAACCTGAGTGGTTCAGAAACATCCTGAGCAT +CGCACCAATAATGTTCTCAAACAAAATGGCAAGACTAGGAAAAGGATACATGTTCGAGAGTAAGAGAATGAAGCTCCGAA +CACAAATACCCGCAGAAATGCTAGCAAGCATTGACCTGAAGTATTTCAATGAATCAACAAGGAAGAAAATTGAGAAAATA +AGGCCTCTTCTAATAGATGGCACAGCATCATTGAGCCCTGGGATGATGATGGGCATGTTCAACATGCTAAGTACGGTTTT +AGGAGTCTCGGTACTGAATCTTGGGCAAAAGAAATACACCAAGACAACATACTGGTGGGATGGGCTCCAATCCTCCGACG +ATTTTGCCCTCATAGTGAATGCACCAAATCATGAGGGAATACAAGCAGGAGTGGATAGATTCTACAGGACCTGCAAGTTA +GTGGGAATCAACATGAGCAAAAAGAAGTCCTATATAAATAAAACAGGGACATTTGAATTCACAAGCTTTTTTTATCGATA +TGGATTTGTGGCTAATTTTAGCATGGAGCTTCCCAGTTTTGGAGTGTCTGGAATAAACGAGTCAGCTGATATGAGTATTG +GAGTAACAGTGATAAAGAACAACATGATAAACAATGACCTTGGGCCAGCAACAGCCCAGATGGCTCTCCAATTGTTCATC +AAAGACTACAGATATACATATAGGTGCCATAGAGGAGACACACAAATTCAGACGAGAAGATCATTCGAGCTAAAGAAGCT +GTGGGATCAAACCCAATCAAGGGCAGGACTATTGGTATCAGATGGGGGACCAAACTTATACAATATCCGGAACCTTCACA +TCCCTGAAGTCTGCTTAAAGTGGGAGCTAATGGATGAGAATTATCGGGGAAGACTTTGTAACCCCCTGAATCCCTTTGTC +AGCCATAAAGAAATTGAGTCTGTAAACAATGCTGTAGTGATGCCAGCCCACGGTCCAGCCAAAAGTATGGAATATGATGC +CGTTGCAACTACACACTCCTGGAATCCCAAGAGGAACCGCTCTATTCTAAACACTAGCCAAAGGGGAATTCTTGAGGATG +AACAGATGTACCAAAAGTGCTGCAACTTGTTCGAGAAATTTTTCCCTAGTAGTTCATATAGGAGACCGATTGGAATTTCT +AGCATGGTGGAGGCCATGGTGTCTAGGGCCCGGATTGATGCCAGAATTGACTTCGAGTCTGGACGGATTAAGAAGGAAGA 
+GTTCTCTGAGATCATGAAGATCTGTTCCACCATTGAAGAACTCAGACGGCAAAAATAATGAATTTAGCTTGTCCTTCATG +AAAAAATGCCTTGTTTCTACT +>kraken:taxid|335341|NC_007371.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 3, complete sequence +AGCAAAAGCAGGTACTGATTCGAAATGGAAGATTTTGTGCGACAATGCTTCAACCCGATGATTGTCGAACTTGCAGAAAA +AGCAATGAAAGAGTATGGAGAGGATCTGAAAATTGAAACAAACAAATTTGCAGCAATATGCACCCACTTGGAGGTATGTT +TCATGTATTCAGATTTTCATTTCATCAATGAACAAGGCGAATCAATAGTGGTAGAACTTGATGATCCAAATGCACTGTTA +AAGCACAGATTTGAAATAATCGAGGGGAGAGACAGAACAATGGCCTGGACAGTAGTAAACAGTATCTGCAACACTACTGG +AGCAGAAAAACCAAAGTTTCTACCAGATTTGTATGATTACAAGGAGAATAGATTCATCGAAATTGGAGTGACAAGAAGAG +AAGTCCACATATATTACCTTGAAAAGGCCAATAAAATTAAATCTGAGAACACACACATTCACATCTTCTCATTCACTGGG +GAGGAAATAGCCACAAAGGCAGACTACACTCTCGACGAGGAAAGCAGGGCTAGGATTAAAACCAGGCTATTTACCATAAG +ACAAGAAATGGCCAACAGAGGCCTCTGGGATTCCTTTCGTCAGTCCGAAAGAGGCGAAGAAACAATTGAAGAAAAATTTG +AAATCTCAGGAACTATGCGTAGGCTTGCCGACCAAAGTCTCCCACCGAAATTCTCCTGCCTTGAGAATTTTAGAGCCTAT +GTGGATGGATTCGAACCGAACGGCTGCATTGAGGGCAAGCTTTCTCAAATGTCCAAAGAAGTGAATGCCAAAATTGAACC +TTTTCTGAAGACAACACCAAGACCAATCAAACTTCCTAATGGACCTCCTTGTTATCAGCGGTCCAAATTCCTCCTGATGG +ATGCTTTGAAATTGAGCATTGAAGACCCAAGTCATGAAGGAGAAGGGATTCCATTATATGATGCGATCAAGTGCATAAAA +ACATTCTTTGGATGGAAAGAACCTTATATAGTCAAACCACACGAAAAGGGAATAAATTCAAATTACCTGCTGTCATGGAA +GCAAGTATTGTCAGAATTGCAGGACATTGAAAATGAGGAGAAGATCCCAAGGACTAAAAACATGAAGAAAACGAGTCAAC +TAAAGTGGGCTCTTGGTGAAAACATGGCACCAGAGAAAGTAGACTTTGACAACTGCAGAGACATAAGCGATTTGAAGCAA +TATGATAGTGACGAACCTGAATTAAGGTCACTTTCAAGCTGGATACAGAATGAGTTCAACAAGGCCTGCGAGCTAACTGA +TTCAATCTGGATAGAGCTCGATGAAATTGGAGAGGACGTAGCCCCAATTGAGTACATTGCAAGCATGAGGAGGAATTATT +TCACAGCAGAGGTGTCCCATTGTAGAGCCACTGAGTACATAATGAAGGGGGTATACATTAATACTGCCCTGCTCAATGCA +TCCTGTGCAGCAATGGACGATTTTCAACTAATTCCCATGATAAGCAAGTGCAGAACTAAAGAGGGAAGGCGAAAAACCAA +TTTATATGGATTCATCATAAAGGGAAGATCTCATTTAAGGAATGACACAGATGTGGTAAACTTTGTGAGCATGGAGTTTT +CTCTCACTGACCCGAGACTTGAGCCACATAAATGGGAGAAATACTGTGTCCTTGAGATAGGAGATATGTTACTAAGAAGT 
+GCCATAGGCCAAATTTCAAGGCCTATGTTCTTGTATGTGAGGACAAACGGAACATCAAAGGTCAAAATGAAATGGGGAAT +GGAGATGAGACGTTGCCTCCTTCAGTCACTCCAGCAGATCGAGAGCATGATTGAAGCCGAGTCCTCGATTAAAGAGAAAG +ACATGACCAAAGAGTTTTTTGAGAATAAATCAGAAGCATGGCCCATTGGGGAGTCCCCCAAGGGAGTGGAAGAAGGTTCC +ATTGGGAAAGTCTGTAGGACTCTATTGGCTAAGTCAGTGTTCAATAGCCTGTATGCATCACCACAATTGGAAGGATTTTC +AGCGGAGTCAAGAAAACTGCTTCTTGTTGTTCAGGCTCTTAGGGACAACCTCGAACCTGGGACCTTTGATCTCGGGGGGC +TATATGAAGCAATTGAGGAGTGCCTGATTAATGATCCCTGGGTTTTGCTCAATGCATCTTGGTTCAACTCCTTCCTGACA +CATGCATTAAAATAGTTATGGCAGTGCTACTATTTGTTATCCGTACTGTCCAAAAAAGTACCTTGTTTCTACT +>kraken:taxid|335341|NC_007366.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 4, complete sequence +AGCAAAAGCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAA +AAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAAC +AATCACGAATGACCAAATTGAAGTCACTAATGCTACTGAACTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTC +CTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAAT +AAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCT +TAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAA +CAAGCTCTGCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATAC +CCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAACTGTACATTTGGGGGGTTCACCACCCGGGTACGGA +CAATGACCAAATCAGCCTATATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAATCC +CGAGTATCGGATCTAGACCCAGGATAAGGGATGTCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGAC +ATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAAT +GAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTC +AAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGA +AATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGA +CGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAACAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCA +ACCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACAAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAA 
+GTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCT +TGTGGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAAC +TGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATC +AGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGTTGAA +GTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCA +TCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGAGTGCATTAATTAAAAACACCCTTGTTTCTA +CT +>kraken:taxid|335341|NC_007369.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 5, complete sequence +AGCAAAAGCAGGGTTAATAATCACTCACCGAGTGACATCAAAATCATGGCGTCCCAAGGCACCAAACGGTCTTATGAACA +GATGGAAACTGATGGGGATCGCCAGAATGCAACTGAGATTAGGGCATCCGTCGGGAAGATGATTGATGGAATTGGGAGAT +TCTACATCCAAATGTGCACTGAACTTAAACTCAGTGATCATGAAGGGCGGTTGATCCAGAACAGCTTGACAATAGAGAAA +ATGGTGCTCTCTGCTTTTGATGAAAGAAGGAATAAATACCTGGAAGAACACCCCAGCGCGGGGAAAGATCCCAAGAAAAC +TGGGGGGCCCATATACAGGAGAGTAGATGGAAAATGGATGAGGGAACTCGTCCTTTATGACAAAGAAGAGATAAGGCGAA +TCTGGCGCCAAGCCAACAATGGTGAGGATGCGACAGCTGGTCTAACTCACATAATGATCTGGCATTCCAATTTGAATGAT +GCAACATACCAGAGGACAAGAGCTCTTGTTCGAACTGGAATGGATCCCAGAATGTGCTCTCTGATGCAGGGCTCGACTCT +CCCTAGAAGGTCCGGAGCTGCAGGTGCTGCAGTCAAAGGAATCGGGACAATGGTGATGGAACTGATCAGAATGGTCAAAC +GGGGGATCAACGATCGAAATTTCTGGAGAGGTGAGAATGGGCGGAAAACAAGAAGTGCTTATGAGAGAATGTGCAACATT +CTTAAAGGAAAATTTCAAACAGCTGCACAAAGAGCAATGGTGGATCAAGTGAGAGAAAGTCGGAACCCAGGAAATGCTGA +GATCGAAGATCTCATATTTTTGGCAAGATCTGCATTGATATTGAGAGGGTCAGTTGCTCACAAATCTTGCCTACCTGCCT +GTGCGTATGGACCTGCAGTATCCAGTGGGTACGACTTCGAAAAAGAGGGATATTCCTTGGTGGGAATAGACCCTTTCAAA +CTACTTCAAAATAGCCAAATATACAGCCTAATCAGACCTAACGAGAATCCAGCACACAAGAGTCAGCTGGTGTGGATGGC +ATGCCATTCTGCTGCATTTGAAGATTTAAGATTGTTAAGCTTCATCAGAGGGACAAAAGTATCTCCGCGGGGGAAACTGT +CAACTAGAGGAGTACAAATTGCTTCAAATGAGAACATGGATAATATGGGATCGAGCACTCTTGAACTGAGAAGCGGGTAC +TGGGCCATAAGGACCAGGAGTGGAGGAAACACTAATCAACAGAGGGCCTCCGCAGGCCAAACCAGTGTGCAACCTACGTT +TTCTGTACAAAGAAACCTCCCATTTGAAAAGTCAACCATCATGGCAGCATTCACTGGAAATACGGAGGGAAGGACTTCAG 
+ACATGAGGGCAGAAATCATAAGAATGATGGAAGGTGCAAAACCAGAAGAAGTGTCATTCCGGGGGAGGGGAGTTTTCGAG +CTCTCAGACGAGAAGGCAACGAACCCGATCGTGCCCTCTTTTGATATGAGTAATGAAGGATCTTATTTCTTCGGAGACAA +TGCAGAAGAGTACGACAATTAAGGAAAAAATACCCTTGTTTCTACT +>kraken:taxid|335341|NC_007368.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 6, complete sequence +AGCAAAAGCAGGAGTAAAGATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATAT +GCTTCTTCATGCAAATTGCCATCCTGATAACCACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAAC +AACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGA +GAAGGAAATGTGCCCCAAACTAGCAGAATACAGAAATTGGTCAAAGCCGCAATGTGACATTACAGGATTTGCACCTTTTT +CTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGACCCTGAC +AAGTGTTACCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATGACACAGTACATGATAGGACCCC +TTATCGGACCCTATTGATGAATGAATTAGGTGTTCCATTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCT +CAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTGTAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTAC +AATGGGAGGCTTGTAGATAGTATTGTTTCATGGTCCAAAAAAATCCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAA +TGGAACTTGTACAGTAGTAATGACTGATGGGAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGA +AAATCATTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAGGAGTGCTCCTGCTATCCTCGATATCCTGGTGTC +AGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATAGGCCCATCGTAGATATAAACATAAAGGATTATAGCATTGTTTC +CAGTTATGTGTGCTCAGGGCTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGCTTGGATCCTA +ACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACGATCAGC +GAGAAGTTACGCTCAGGATATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAAACCTAATTCCAAATTGCAGATAAATAG +GCAAGTCATAGTTGACAGAGGTAATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGT +GCTTTTATGTGGAGTTGATAAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGT +GGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACATCAATCTCATGCCTATATAAGCTTTCGCAAT +TTTAGAAAAAAACTCCTTGTTTCTACT +>kraken:taxid|335341|NC_007367.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 7, complete sequence 
+AGCAAAAGCAGGTAGATATTGAAAGATGAGCCTTCTAACCGAGGTCGAAACGTATGTTCTCTCTATCGTTCCATCAGGCC +CCCTCAAAGCCGAGATCGCGCAGAGACTTGAAGATGTCTTTGCTGGGAAAAACACAGATCTTGAGGCTCTCATGGAATGG +CTAAAGACAAGACCAATTCTGTCACCTCTGACTAAGGGGATTTTGGGGTTTGTGTTCACGCTCACCGTGCCCAGTGAGCG +AGGACTGCAGCGTAGACGCTTTGTCCAAAATGCCCTCAATGGGAATGGAGATCCAAATAACATGGACAAAGCAGTTAAAC +TGTATAGGAAACTTAAGAGGGAGATAACGTTCCATGGGGCCAAAGAAATAGCTCTCAGTTATTCTGCTGGTGCACTTGCC +AGTTGCATGGGCCTCATATACAATAGGATGGGGGCTGTAACCACTGAAGTGGCATTTGGCCTGGTATGTGCAACATGTGA +ACAGATTGCTGACTCCCAGCACAGGTCTCATAGGCAAATGGTGGCAACAACCAATCCATTAATAAAACATGAGAACAGAA +TGGTTTTGGCCAGCACTACAGCTAAGGCTATGGAGCAAATGGCTGGATCAAGTGAGCAGGCAGCGGAGGCCATGGAAATT +GCTAGTCAGGCCAGGCAAATGGTGCAGGCAATGAGAGCCGTTGGGACTCATCCTAGCTCCAGTACTGGTCTAAGAGATGA +TCTTCTTGAAAATTTGCAGACCTATCAGAAACGAATGGGGGTGCAGATGCAACGATTCAAGTGACCCGCTTGTTGTTGCC +GCGAGTATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCGTCTATCGACTCTTCAA +ACACGGCCTTAAAAGAGGCCCTTCTACGGAAGGAGTACCTGAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAATG +CTGTGGATGCTGACGACAGTCATTTTGTCAGCATAGAGTTGGAGTAAAAAACTACCTTGTTTCTACT +>kraken:taxid|335341|NC_007370.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 8, complete sequence +AGCAAAAGCAGGGTGACAAAGACATAATGGATTCCAACACTGTGTCAAGTTTCCAGGTAGATTGCTTTCTTTGGCATATC +CGGAAACAAGTTGTAGACCAAGAACTGAGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAGGTCCCTAAGGGG +AAGAGGCAATACTCTCGGTCTAGACATCAAAGCAGCCACCCATGTTGGAAAGCAAATTGTAGAAAAGATTCTGAAAGAAG +AATCTGATGAGGCACTTAAAATGACCATGGTCTCCACACCTGCTTCGCGATACATAACTGACATGACTATTGAGGAATTG +TCAAGAAACTGGTTCATGCTAATGCCCAAGCAGAAAGTGGAAGGACCTCTTTGCATCAGAATGGACCAGGCAATCATGGA +GAAAAACATCATGTTGAAAGCGAATTTCAGTGTGATTTTTGACCGACTAGAGACCATAGTATTACTAAGGGCTTTCACCG +AAGAGGGAGCAATTGTTGGCGAAATCTCACCATTGCCTTCTTTTCCAGGACATACTATTGAGGATGTCAAAAATGCAATT +GGGGTCCTCATCGGAGGACTTGAATGGAATGATAACACAGTTCGAGTCTCTAAAAATCTACAGAGATTCGCTTGGAGAAG +CAGTAATGAGAATGGGGGACCTCCACTTACTCCAAAACAGAAACGGAAAATGGCGAGAACAGCTAGGTCAAAAGTTTGAA +GAGATAAGATGGCTGATTGAAGAAGTGAGACACAGACTAAAAACAACTGAAAATAGCTTTGAACAAATAACATTCATGCA 
+AGCATTACAACTGCTGTTTGAAGTGGAACAGGAGATAAGAACTTTCTCATTTCAGCTTATTTAATGATAAAAAACACCCT +TGTTTCTACT diff --git a/data/FluB.fa b/data/FluB.fa new file mode 100644 index 0000000..24250dc --- /dev/null +++ b/data/FluB.fa @@ -0,0 +1,192 @@ +>kraken:taxid|518987|NC_002204.1 Influenza B virus RNA 1, complete sequence +AGCAGAAGCGGAGCTTTAAGATGAATATAAATCCATATTTTCTTTTCATAGATGTACCTATACAGGCAGCAATTTCAACA +ACATTCCCATACACCGGTGTTCCCCCTTATTCTCATGGAACGGGAACAGGCTACACAATAGACACCGTGATTAGAACACA +CGAGTACTCAAACAAGGGAAAACAATACATTTCTGATGTTACAGGATGTGTAATGGTAGATCCAACAAATGGGCCATTAC +CCGAAGACAATGAACCGAGTGCCTATGCACAATTGGATTGTGTTCTGGAGGCTTTGGATAGAATGGATGAAGAACATCCA +GGTCTGTTTCAAGCAGGGTCACAGAATGCCATGGAGGCACTAATGGTCACAACAGTGGACAAATTGACTCAGGGGAGACA +GACCTTTGATTGGACGGTGTGTAGAAACCAACCTGCTGCAACGGCACTGAACACAACAATAACCTCTTTTAGGTTGAATG +ATTTAAATGGAGCCGACAAGGGTGGATTAGTGCCCTTTTGCCAAGATATCATTGATTCATTAGACAAACCTGAAATGATT +TTCTTCACAGTAAAGAATATAAAGAAAAAATTGCCTGCTAAAAACAGAAAGGGTTTCCTTATAAAAAGAATACCTATGAA +GGTAAAAGACAGAATAACAAGAGTGGAATACATCAAAAGAGCATTATCATTAAACACAATGACTAAAGATGCTGAAAGAG +GCAAACTAAAAAGAAGAGCAATTGCCACCGCTGGGATACAAATCAGAGGATTTGTATTAGTAGTTGAAAACTTGGCTAAA +AATATCTGTGAAAATCTAGAGCAAAGTGGTTTACCCGTAGGTGGAAACGAAAAGAAGGCCAAACTATCAAATGCAGTGGC +TAAAATGCTCAGTAATTGTCCACCAGGAGGGATCAGTATGACTGTGACAGGAGACAATACTAAATGGAATGAATGCTTAA +ATCCAAGAATCTTTTTGGCTATGACTGAAAGAATAACCAGAGACAGCCCAATTTGGTTCCGGGATTTTTGTAGTATAGCA +CCGGTCTTGTTCTCCAATAAAATAGCTAGATTGGGAAAAGGGTTCATGATAACAAGTAAAACAAAAAGACTAAAAGCTCA +AATACCTTGTCCCGATCTGTTTAATATACCATTAGAAAGATATAATGAAGAAACAAGGGCAAAACTGAAAAAGCTAAAAC +CTTTCTTCAATGAAGAAGGAACGGCATCTCTTTCGCCAGGAATGATGATGGGAATGTTTAATATGCTATCTACAGTATTA +GGAGTAGCCGCACTAGGGATAAAAAACATTGGAAACAAAGAATACTTATGGGATGGACTGCAGTCTTCGGATGATTTTGC +TCTGTTTGTTAATGCAAAAGATGAAGAGACATGTATGGAAGGAATAAACGATTTTTACCGAACATGTAAGCTATTGGGAA +TAAACATGAGCAAAAAGAAAAGTTACTGTAATGAAACTGGGATGTTTGAATTTACCAGCATGTTTTACAGAGATGGATTT +GTATCTAATTTTGCAATGGAACTCCCTTCATTTGGAGTCGCTGGAGTGAATGAATCAGCAGACATGGCAATAGGAATGAC 
+AATAATAAAGAACAATATGATCAACAATGGGATGGGCCCAGCAACGGCACAAACAGCCATACAATTATTCATAGCTGACT +ATAGATACACCTACAAATGCCACAGGGGAGATTCCAAAGTGGAAGGGAAGAGAATGAAAATTATAAAGGAGCTATGGGAA +AACACTAAAGGAAGAGATGGTCTATTAGTAGCAGATGGTGGGCCTAATCTTTACAATTTGAGAAACCTGCATATTCCAGA +AATAATATTAAAATACAACATAATGGACCCTGAGTACAAAGGACGGTTACTGCATCCTCAAAATCCCTTTGTAGGACATT +TGTCTATTGAGGGTATCAAAGAAGCAGATATAACACCTGCACATGGCCCAATAAAGAAAATGGACTACGATGCGGTATCT +GGAACTCATAGTTGGAGAACCAAAAGGAACAGATCTATACTAAACACTGATCAGAGGAACATGATTCTTGAGGAACAATG +CTACGCTAAGTGTTGCAACCTTTTTGAGGCTTGCTTTAACAGTGCGTCATACAGGAAACCAGTAGGCCAGCACAGCATGC +TTGAAGCTATGGCCCACAGATTAAGAATGGATGCACGACTGGACTATGAGTCAGGAAGGATGTCAAAAGAGGATTTCGAA +AAAGCAATGGCTCACCTTGGTGAGATTGGGTACATGTAAGCTCCGGAAATGTCTATGGGGTTATTGGTCATCGTTGAATA +CATGCGGTGCACAAATGATTAAAATGAAAAAAGGCTCGTGTTTCTACT +>kraken:taxid|518987|NC_002205.1 Influenza B virus (B/Lee/1940) segment 2, complete sequence +ATGACGTTGGCTAAAATTGAACTACTAAAGCAGCTGTTAAGGGACAATGAAGCCAAAACGGTGTTGAGACAGACAACGGT +AGACCAATACAACATAATAAGAAAATTCAATACATCAAGAATTGAAAAGAACCCTTCATTAAGAATGAAGTGGGCCATGT +GTTCCAATTTTCCCTTAGCTCTGACCAAGGGTGATATGGCAAATCGAATCCCCTTGGAATACAAGGGAATACAACTTAAA +ACAAATGCTGAAGACATAGGAACTAAAGGACAAATGTGTTCAATAGCAGCAGTTACCTGGTGGAATACATATGGGCCCAT +AGGGGATACTGAAGGGTTTGAAAAGGTCTACGAAAGCTTTTTTCTCAGAAAGATGAGACTTGACAATGCCACTTGGGGCC +GAATAACCTTTGGCCCTGTTGAGAGAGTAAGAAAAAGAGTACTACTAAACCCGCTCACCAAGGAAATGCCCCCAGATGAA +GCGAGCAATGTAATAATGGAAATATTATTCCCTAAAGAAGCAGGAATACCAAGAGAATCTACTTGGATACATAGAGAACT +GATAAAAGAAAAAAGAGAAAAATTGAAGGGAACGATGATAACTCCCATTGTACTGGCATACATGCTTGAGAGAGAACTAG +TTGCCCGAAGAAGGTTCCTGCCAGTAGCAGGAGCAACATCAGCAGAGTTCATAGAAATGCTACATTGCTTACAAGGTGAA +AATTGGAGACAAATATATCATCCAGGAGGGAATAAACTAACTGAATCTAGATCTCAATCAATGATTGTAGCTTGCAGGAA +GATAATCAGAAGATCAATAGTTGCATCAAACCCACTAGAGCTAGCTGTAGAGATTGCAAATAAGACTGTGATAGACACTG +AACCTTTAAAATCATGTCTGGCAGCCCTGGATGGAGGTGATGTAGCCTGTGACATAATAAGAGCTGCATTAGGATTAAAA +ATTAGACAAAGACAAAGATTTGGGAGACTTGAACTAAAGAGAATATCAGGAAGAGGATTCAAAAATGATGAAGAGATATT 
+AATCGGAAACGGAACAATACAAAAGATTGGAATATGGGACGGAGAAGAGGAATTCCATGTAAGATGTGGCGAATGCAGGG +GGATATTGAAAAAAAGCCAAATGAGAATGGAAAAACTACTGATAAATTCAGCCAAAAAGGAGGACATGAAAGATTTAATA +ATCTTATGCATGGTATTTTCTCAAGACACTAGGATGTTCCAAGGAGTGAGAGGAGAGATAAATTTTCTTAATCGAGCAGG +CCAACTTTTATCCCCCATGTACCAACTCCAACGATACTTTCTGAATAGGAGCAATGACCTTTTTGATCAATGGGGATATG +AGGAATCACCTAAAGCAAGTGAGCTACATGGGATAAATGAATTAATGAATGCATCTGACTATACATTGAAAGGGGTTGTA +GTAACAAAAAATGTGATTGATGATTTTAGTTCTACTGAAACAGAAAAAGTATCTATAACAAAAAATCTTAGTTTAATAAA +AAGGACTGGGGAAGTTATAATGGGAGCCAATGACGTAAGTGAATTAGAATCACAAGCACAGCTAATGATAACGTATGATA +CACCCAAGATGTGGGAAATGGGAACAACCAAAGAACTGGTACAAAACACTTACCAATGGGTGCTTAAAAATTTAGTAACA +TTGAAGGCTCAGTTTCTTTTGGGAAAAGAAGACATGTTCCAATGGGATGCATTTGAAGCATTTGAAAGCATAATCCCTCA +GAAGATGGCTGGTCAGTACAGTGGATTTGCAAGAGCAGTGCTCAAACAAATGAGAGACCAAGAGGTTATGAAAACTGACC +AATTCATAAAATTGTTGCCTTTCTGTTTTTCGCCACCAAAATTAAGGAGCAATGGAGAGCCTTATCAATTTTTGAGGCTT +ATGCTGAAAGGAGGAGGGGAAAATTTCATCGAAGTAAGGAAAGGGTCCCCCTTGTTCTCCTACAATCCACAAACGGAAAT +CCTAACTATATGCGGCAGAATGATGTCATTAAAAGGAAAAATTGAGGATGAAGAAAGAAATAGATCAATGGGGAATGCAG +TACTGGCAGGCTTTCTTGTTAGTGGCAAATATGACCCTGATCTTGGAGATTTCAAAACCATTGAGGAACTTGAAAGACTA +AAACCGGGAGAAAAAGCCAACATCTTACTTTACCAAGGAAAGCCCGTTAAAGTAGTTAAAAGGAAAAGATATAGTGCTTT +ATCCAATGATATTTCACAAGGGATTAAGAGACAAAGAATGACAGTTGAGTCCATGGGGTGGGCCTTGAGCTAA +>kraken:taxid|518987|NC_002206.1 Influenza B virus (B/Lee/1940) segment 3, complete sequence +ATGGATACTTTTATTACAAAGAATTTCCAGACTACAATAATACAAAAGGCCAAAAACACAATGGCAGAATTTAGTGAAGA +TCCTGAATTACAGCCAGCAGTACTATTCAACATCTGCGTCCATCTGGAGGTCTGCTATGTAATAAGTGATATGAACTTTC +TTGATGAGGAAGGAAAGACATATACAGCATTAGAAGGACAAGGAAAAGAGCAAAATTTGAGACCACAGTATGAAGTGATT +GAGGGAATGCCAAGAAACATAGCATGGATGGTTCAAAGATCCTTAGCCCAAGAGCATGGAATAGAGACTCCAAGGTATCT +GGCTGATTTATTTGATTATAAAACCAAGAGGTTTATCGAAGTCGGAATAACAAAGGGATTGGCTGATGATTACTTTTGGA +AAAAGAAAGAAAAGTTGGGGAATAGCATGGAACTGATGATATTCAGCTACAATCAAGACTACTCGTTAAGTGATGAATCT +TCATTGGATGAGGAAGGAAAAGGGAGAGTGCTAAGCAGACTCACAGAACTTCAGGCTGAGTTAAGTTTGAAAAACCTATG 
+GCAAGTTCTAATAGGGGAAGAAGAAATTGAAAAAGGAATTGACTTCAAACTTGGACAAACAATATCTAAACTGAGGAATA +TATCTGTTCCAGCTGGTTTCTCCAATTTTGAAGGGATGAGAAGTTACATAGACAACATAGACCCTAAAGGAGCAATAGAG +AGAAATCTAGCAAGGATGTCTCCCTTAGTATCAGTTACACCCAAAAAGTTGAAATGGGAGGACCTGAGACCCATAGGGCC +TCACATTTACAACCATGAGCTACCAGAAGTTCCATATAATGCCTTTCTCCTCATGTCTGATGAGTTGGGGCTGGCCAATA +TGACTGAAGGAAAGTCCAAGAAACCGAAGACCTTAGCTAAGGAATGTCTAGAAAGGTATTCAACACTACGTGATCAAACT +GACCCAATATTGATAATGAAAAGCGAAAAAGCTAACGAAAACTTCTTATGGAGGTTATGGAGGGACTGTGTAAATACAAT +AAGCAATGAGGAAACAGGCAACGAATTACAGAAAACCAATTATGCCAAGTGGGCCACAGGAGATGGACTAACATACCAAA +AAATAATGAAAGAAGTAGCAATAGATGACGAAACGATGTACCAAGAAGAACCCAAAATACCCAATAAATGTAGAGTGGCT +GCTTGGGTTCAGGCAGAGATGAATCTACTGAGTACTCTGACAAGTAAAAGGGCCCTGGATCTGCCAGAAATAGGGCCAGA +TGTAGCACCCGTGGAGCATGTAGGGAGTGAAAGAAGGAAATACTTTGTTAATGAAATCAACTACTGTAAAGCCTCTACAG +TTATGATGAAGTATGTACTTTTTCACACTTCATTATTAAATGAAAGCAATGCTAGTATGGGAAAATATAAAGTAATACCA +ATCACCAACAGAGTGGTAAATGAAAAAGGGGAAAGCTTTGACATGCTTTATGGTCTGGCGGTTAAGGGGCAATCTCATTT +GCGGGGGGACACGGATGTTGTAACAGTTGTGACTTTCGAGTTTAGTAGTACAGATCCTAGAGTGGACTCAGGAAAGTGGC +CAAAATATACTGTCTTTAAAATTGGCTCCCTATTTGTGAGTGGAAGAGAAAAACCTGTGTACCTATATTGCCGAGTGAAT +GGTACAAACAAAATCCAAATGAAATGGGGAATGGAAGCTAGAAGATGTCTGCTTCAATCAATGCAACAAATGGAGGCAAT +TGTTGATCAAGAATCATCGATACAAGGGTATGATATGACCAAAGCTTGTTTCAAGGGAGACAGAGTGAATAATCCCAAAA +CTTTCAGTATTGGGACTCAGGAAGGCAAACTAGTAAAAGGGTCCTTTGGGAAAGCACTAAGAGTAATATTCACCAAATGT +TTGATGCATTATGTATTTGGAAATGCTCAATTGGAGGGGTTTAGTGCCGAATCTAGGAGACTTCTACTGTTAATTCAGGC +ATTAAAAGACAGGAAGGGCCCTTGGGTATTTGACTTGGAGGGAATGTACTTTGGAGTAGAGGAATGTATTAGTAACAATC +CTTGGGTAATACAGAGTGCATACTGGTTTAATGAATGGTTGGGCATTGAAAAAGAAGGAAGTAAAGTGTTAGAATCAATA +GATGAAATAATGGATGAATGAACGAAGGGCATAGCGCTCAATTT +>kraken:taxid|518987|NC_002207.1 Influenza B virus (B/Lee/1940) segment 4, complete sequence +AGCAGAAGCGTTGCATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGA +TCGAATCTGCACTGGGATAACATCGTCAAACTCACCTCATGTGGTTAAAACTGCCACTCAAGGGGAAGTCAATGTGACTG 
+GTGTGATACCACTAACAACAACACCTACCAAATCTCATTTTGCAAATCTCAAAGGAACACAGACCAGAGGAAAACTATGC +CCAAACTGTTTTAACTGCACAGATCTGGACGTGGCCCTAGGCAGACCAAAATGCATGGGGAACACACCCTCCGCAAAAGT +CTCAATACTCCATGAAGTCAAACCTGCTACATCTGGATGCTTTCCTATAATGCACGACAGAACAAAAATCAGACAACTAC +CTAATCTTCTCAGAGGATATGAAAACATCAGGTTATCAACCAGTAATGTTATCAATACAGAGACGGCACCAGGAGGACCC +TACAAGGTGGGGACCTCAGGATCTTGCCCTAACGTTGCTAATGGGAACGGCTTCTTCAACACAATGGCTTGGGTTATCCC +AAAAGACAACAACAAGACAGCAATAAATCCAGTAACAGTAGAAGTACCATACATTTGTTCAGAAGGGGAAGACCAAATTA +CTGTTTGGGGGTTCCACTCTGATGACAAAACCCAAATGGAAAGACTCTATGGAGACTCAAATCCTCAAAAGTTCACCTCA +TCTGCCAATGGAGTAACCACACATTATGTTTCTCAGATTGGTGGCTTCCCAAATCAAACAGAAGACGAAGGGCTAAAACA +AAGCGGCAGAATTGTTGTTGATTACATGGTACAAAAACCTGGAAAAACAGGAACAATTGTTTATCAAAGAGGCATTTTAT +TGCCTCAAAAAGTGTGGTGCGCAAGTGGCAGGAGCAAGGTAATAAAAGGGTCCTTGCCTTTAATTGGTGAAGCAGATTGC +CTCCACGAAAAGTACGGTGGATTAAATAAAAGCAAGCCTTACTACACAGGAGAGCATGCAAAGGCCATAGGAAATTGCCC +AATATGGGTGAAAACACCCTTGAAGCTGGCCAATGGAACCAAATATAGACCGCCTGCAAAACTATTAAAGGAAAGAGGTT +TCTTCGGAGCTATTGCTGGTTTCTTGGAAGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATACACATCTCATGGA +GCACATGGAGTGGCAGTGGCAGCAGACCTTAAGAGTACACAAGAAGCTATAAACAAGATAACAAAAAATCTCAACTATTT +AAGTGAGCTAGAAGTAAAAAACCTTCAAAGACTAAGCGGAGCAATGAATGAGCTTCACGACGAAATACTCGAGCTAGACG +AAAAAGTGGATGATCTAAGAGCTGATACAATAAGCTCACAAATAGAGCTTGCAGTCTTGCTTTCCAACGAAGGGATAATA +AACAGTGAAGATGAGCATCTCTTGGCACTTGAAAGAAAACTGAAGAAAATGCTTGGCCCCTCTGCTGTAGAAATAGGGAA +TGGGTGCTTTGAAACCAAACACAAATGCAACCAGACTTGCCTAGACAGGATAGCTGCTGGCACCTTTAATGCAGGAGATT +TTTCTCTTCCCACTTTTGATTCATTAAACATTACTGCTGCATCTTTAAATGATGATGGCTTGGATAATCATACTATACTG +CTCTACTACTCAACTGCTGCTTCTAGCTTGGCTGTAACATTAATGATAGCTATCTTCATTGTCTACATGGTCTCCAGAGA +CAATGTTTCTTGTTCCATCTGTCTGTGAGGGAGATTAAGCCCTGTGTTTTCCTTTACTGTAGTGCTCATTTGCTTGTCAC +CATTACAAAGAAACGTTATTGAAAAATGCTCTTGTTACTACT +>kraken:taxid|518987|NC_002208.1 Influenza B virus (B/Lee/1940) segment 5, complete sequence +GGCAGAAGCACAGCATTTTCTTGTGAGCTTCGAGCACTAATAAAACTGAAAATCAAAATGTCCAACATGGATATTGACAG 
+TATAAATACCGGAACAATCGATAAAACACCAGAAGAACTGACTCCCGGAACCAGTGGGGCAACCAGACCAATCATCAAGC +CAGCAACCCTTGCTCCGCCAAGCAACAAACGAACCCGAAATCCATCTCCAGAAAGGACAACCACAAGCAGTGAAACCGAT +ATCGGAAGGAAAATCCAAAAGAAACAAACCCCAACAGAGATAAAGAAGAGCGTCTACAAAATGGTGGTAAAACTGGGTGA +ATTCTACAACCAGATGATGGTCAAAGCTGGACTTAATGATGACATGGAAAGGAATCTAATTCAAAATGCACAAGCTGTGG +AGAGAATCCTATTGGCTGCAACTGATGACAAGAAAACTGAATACCAAAAGAAAAGGAATGCCAGAGATGTCAAAGAAGGG +AAGGAAGAAATAGACCACAACAAGACAGGAGGCACCTTTTATAAGATGGTAAGAGATGATAAAACCATCTACTTCAGCCC +TATAAAAATTACCTTTTTAAAAGAAGAGGTGAAAACAATGTACAAGACCACCATGGGGAGTGATGGTTTCAGTGGACTAA +ATCACATTATGATTGGACATTCACAGATGAACGATGTCTGTTTCCAAAGATCAAAGGGACTGAAAAGGGTTGGACTTGAC +CCTTCATTAATCAGTACTTTTGCCGGAAGCACACTACCCAGAAGATCAGGTACAACTGGTGTTGCAATCAAAGGAGGTGG +AACTTTAGTGGATGAAGCCATCCGATTTATAGGAAGAGCAATGGCAGACAGAGGGCTACTGAGAGACATCAAGGCCAAGA +CGGCCTATGAAAAGATTCTTCTGAATCTGAAAAACAAGTGCTCTGCGCCGCAACAAAAGGCTCTAGTTGATCAAGTGATC +GGAAGTAGGAACCCAGGGATTGCAGACATAGAAGACCTAACTCTGCTTGCCAGAAGCATGGTAGTTGTCAGACCCTCTGT +AGCGAGCAAAGTGGTGCTTCCCATAAGCATTTATGCTAAAATACCTCAACTAGGATTCAATACCGAAGAATACTCTATGG +TTGGGTATGAAGCCATGGCTCTTTATAATATGGCAACACCTGTTTCCATATTAAGAATGGGAGATGACGCAAAAGATAAA +TCTCAACTATTCTTCATGTCGTGCTTCGGAGCTGCCTATGAAGATCTAAGAGTGTTATCTGCACTAACGGGCACCGAATT +TAAGCCTAGATCAGCACTAAAATGCAAGGGTTTCCATGTCCCGGCTAAGGAGCAAGTAGAAGGAATGGGGGCAGCTCTGA +TGTCCATCAAGCTTCAGTTCTGGGCCCCAATGACCAGATCTGGAGGGAATGAAGTAAGTGGAGAAGGAGGGTCTGGTCAA +ATAAGTTGCAGCCCTGTGTTTGCAGTAGAAAGACCTATTGCTCTAAGCAAGCAAGCTGTAAGAAGAATGCTGTCAATGAA +CGTTGAAGGACGTGATGCAGATGTCAAAGGAAATCTACTCAAAATGATGAATGATTCAATGGCAAAGAAAACCAGTGGAA +ATGCTTTCATTGGGAAGAAAATGTTTCAAATATCAGACAAAAACAAAGTCAATCCCATTGAGATTCCAATTAAGCAGACC +ATCCCCAATTTCTTCTTTGGGAGGGACACAGCAGAGGATTATGATGACCTCGATTATTAAAGCAATAAAATAGACACTAT +GGCTGTGACTGTTTCAGTACGTTTGGGATGTGGGTGTTTACTCTTATTGAAATAAATGTAAAAAATGCTGTTGTTTCTAC +T +>kraken:taxid|518987|NC_002209.1 Influenza B virus (B/Lee/1940) segment 6, complete sequence +AGCAGAAGCAGAGCATATTCTTAGAACTGAAGTGAACAGGCCAAAAATGAACAATGCTACCTTCAACTGTACAAACATTA 
+ACCCTATTACTCACATCAGGGGGAGTATTATTATCACTATATGTGTCAGCCTCATTGTCATACTTATTGTATTCGGATGT +ATTGCTAAAATTTTCATCAACAAAAACAACTGCACCAACAATGTCATTAGAGTGCACAAACGCATCAAATGCCCAGACTG +TGAACCATTCTGCAACAAAAGAGATGACATTTCCACCCCCAGAGCCGGAGTGGACATACCCTCGTTTATCTTGCCAGGGC +TCAACCTTTCAGAAGGCACTCCTAATTAGCCCTCATAGGTTCGGAGAGATCAAAGGAAACTCAGCTCCCTTGATAATAAG +AGAACCTTTTGTTGCTTGTGGACCAAAAGAATGCAGACACTTTGCTCTGACCCATTATGCAGCTCAGCCGGGGGGATACT +ACAATGGAACAAGAAAGGACAGAAACAAGCTGAGGCATCTAGTATCAGTCAAATTGGGAAAAATCCCAACTGTGGAAAAC +TCCATTTTCCACATGGCAGCTTGGAGCGGATCCGCATGCCATGATGGTAGAGAATGGACATATATCGGAGTTGATGGTCC +TGACAATGATGCATTGGTCAAAATAAAATATGGAGAAGCATATACTGACACATATCATTCCTATGCACACAACATCCTAA +GAACACAAGAAAGTGCCTGCAATTGCATCGGGGGAGATTGTTATCTTATGATAACAGACGGCTCAGCTTCAGGAATTAGT +AAATGCAGATTTCTTAAAATTAGAGAGGGTCGAATAATAAAAGAAATACTTCCAACAGGAAGAGTGGAGCACACTGAAGA +GTGCACATGCGGGTTCGCCAGCAATAAAACCATAGAATGTGCCTGTAGAGACAACAGTTACACAGCAAAAAGACCCTTTG +TCAAATTAAATGTGGAAACTGATACAGCTGAAATAAGATTGATGTGCACAAAGACTTATCTAGACACTCCCAGACCGGAT +GATGGAAGCATAGCAGGGCCTTGCGAATCTAATGGAGACAAGTGGCTTGGAGGCATCAAAGGAGGATTCGTCCATCAAAG +AATGGCATCTAAGATTGGAAGATGGTACTCCCGAACGATGTCTAAAACTAACAGAATGGGGATGGAACTGTATGTAAAGT +ATGATGGTGACCCATGGACTGACAGTGATGCTCTTACTCTTAGTGGAGTAATGGTTTCCATAGAAGAACCTGGTTGGTAT +TCTTTTGGCTTCGAAATAAAGGACAAGAAATGTGATGTCCCTTGTATTGGGATAGAGATGGTACACGATGGTGGAAAAGA +TACTTGGCATTCAGCTGCAACAGCCATTTACTGTTTGATGGGCTCAGGACAATTGCTATGGGACACTGTCACAGGCGTTG +ATATGGCTTTATAATAGAGGAATGGTTGGATCTGTTCTAAACCCTTTGTTCCTATTTTATTTGAACAGTTGTTCTTACTA +GATTTAATTGTTTCTGAAAAATGCTCTTGTTACTACT +>kraken:taxid|518987|NC_002210.1 Influenza B virus (B/Lee/1940) segment 7, complete sequence +AGCAGAAGCACGCACTTTCTTAAAATGTCGCTGTTTGGAGACACAATTGCCTACCTGCTTTCACTAATAGAAGATGGAGA +AGGCAAAGCAGAACTAGCTGAAAAATTACACTGTTGGTTCGGTGGGAAAGAATTTGACCTAGATTCTGCTTTGGAATGGA +TAAAAAACAAAAGGTGCCTAACTGATATACAAAAAGCACTAATTGGTGCCTCTATATGCTTTTTAAAACCCAAAGACCAA +GAAAGAAAAAGGAGATTCATCACAGAGCCCCTGTCAGGAATGGGAACAACAGCAACAAAGAAGAAAGGCCTAATTCTAGC 
+TGAGAGAAAAATGAGAAGATGTGTAAGCTTTCATGAAGCATTTGAAATAGCAGAAGGCCACGAAAGCTCAGCATTACTAT +ATTGTCTTATGGTCATGTACCTAAACCCTGAAAACTATTCAATGCAAGTAAAACTAGGAACGCTCTGTGCTTTATGCGAG +AAACAAGCATCGCACTCGCATAGAGCCCATAGCAGAGCAGCAAGGTCTTCGGTACCTGGAGTAAGACGAGAAATGCAGAT +GGTTTCAGCTATGAACACAGCAAAGACAATGAATGGAATGGGAAAGGGAGAAGACGTCCAAAAACTAGCAGAAGAGCTGC +AAAACAACATTGGAGTGTTGAGATCTCTAGGAGCAAGTCAAAAGAATGGAGAAGGAATTGCCAAAGATGTAATGGAAGTG +CTAAAACAGAGCTCTATGGGAAATTCAGCTCTTGTGAGGAAATACTTATAATGCTCGAACCACTTCAGATTCTTTCAATT +TGTTCTTTCATTTTATCAGCTCTCCATTTCATGGCTTGGACAATAGGGCATTTGAATCAAATAAAAAGAGGGGTAAACTT +GAAAATACAAATAAGGAATCCAAATAAGGAGGCAATAAACAGAGAGGTGTCAATTCTGAGACACAATTACCAAAAGGAAA +TCCAAGCCAAAGAAACAATGAAGAAAATACTCTCTGACAACATGGAAGTATTGGGTGACCACATAGTAGTTGAAGGGCTT +TCAACTGATGAGATAATAAAAATGGGTGAAACAGTTTTGGAGGTGGAAGAATTGCAATGAGCCCAATTTTCACTGTATTT +CTTACTATGCATTTAAGCAAATTGTAATCAATGTCAGTGAATAAAACTGGAAAAAGTGCGTTGTTTCTACT +>kraken:taxid|518987|NC_002211.1 Influenza B virus (B/Lee/1940) segment 8, complete sequence +CGCAGAAGCAGAGGATTTATTTAGTCACTGGCAAACGGAAAGATGGCGGACAACATGACCACAACACAAATTGAGGTGGG +TCCGGGAGCAACCAATGCCACTATAAACTTTGAAGCAGGAATTCTGGAGTGCTATGAAAGGTTTTCATGGCAAAGAGCCC +TTGACTATCCTGGTCAAGACCGCCTACACAGACTAAAACGAAAATTAGAATCAAGAATAAAGACTCACAACAAGAGTGAG +CCTGAGAATAAAAGGATGTCTCTTGAAGAGAGAAAAGCAATTGGGGTAAAAATGATGAAAGTGCTTCTGTTTATGGATCC +CTCTGCTGGAATTGAAGGGTTTGAGCCATACTGTGTGAAAAATCCCTCAACTAGCAAATGTCCAAATTACGATTGGACCG +ATTACCCTCCAACCCCAGGAAAGTACCTTGATGACATAGAAGAAGAGCCGGAAAATGTCGATCACCCAATTGAGGTAGTA +TTAAGGGACATGAACAATAAAGATGCACGACAAAAGATAAAGGATGAAGTAAACACTCAGAAAGAGGGGAAATTCCGTTT +GACAATAAAAAGGGATATACGTAATGTGTTGTCCTTGAGAGTGTTGGTGAACGGAACCTTCCTCAAGCACCCTAATGGAG +ACAAGTCCTTATCAACTCTTCATAGATTGAATGCATATGACCAGAATGGAGGGCTTGTTGCTAAACTTGTTGCTACTGAT +GATCGGACAGTGGAGGATGAAAAAGATGGCCATCGGATCCTCAACTCACTCTTCGAGCGTTTTGATGAAGGACATTCAAA +GCCAATTCGAGCAGCTGAAACTGCGGTGGGAGTCTTATCCCAATTTGGTCAAGAGCACCGATTATCACCAGAAGAGGGAG +ACAATTAGACTGGCCACGGAAGAACTTTATCTCTTGAGTAAAAGAATTGATGATAGTATATTGTTCCACAAAACAGTAAT 
+AGCTAACAGCTCCATAATAGCTGACATGATTGTATCATTATCATTACTGGAAACATTGTATGAAATGAAGGATGTGGTTG +AAGTGTACAGCAGGCAGTGCTTATGAATGTAAAATAAAAATCCTCTTGTTACTACT diff --git a/data/HIV_1.fna b/data/HIV_1.fna new file mode 100644 index 0000000..6ed3f29 --- /dev/null +++ b/data/HIV_1.fna @@ -0,0 +1,116 @@ +>kraken:taxid|11676|NC_001802.1 Human immunodeficiency virus 1, complete genome +GGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGC +TTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTC +AGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAG +GACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAG +GCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAG +GCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTG +GCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTT +AGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGA +CAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCA +GCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGG +GTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACA +AGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAG +CTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGAC +ATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTA +TAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGAC +CAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAAT +TGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTAC +ACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCC +AAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGT 
+GGCAAAGAAGGGCACACAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCA +AATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTC +TTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAG +CAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAG +ATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAG +ATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTG +GACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGT +TGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAA +ACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAA +AAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTA +GATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAA +GAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTG +CATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGA +TCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCA +ATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATC +TGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCAT +CCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGG +GAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCAC +TAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACAT +GGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTA +TCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAA +CAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAG +GAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTT +AGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGG 
+AGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAG +AAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGC +ATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGG +AAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGA +ATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAAT +GGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAG +CCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTA +GCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTT +AAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCG +CCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAAT +AAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCAT +CCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATAC +AAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGG +AAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAG +AAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATT +AGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGA +AAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTC +TGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTA +GACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATT +AGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAG +CATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAG +AAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGA +CATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGC +CATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGA 
+CAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTT +GTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGC +AGGAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGT +ACATGTAATGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCA +TAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAA +GACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGA +TGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACC +ACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCAC +AGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGA +TGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAG +TGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTG +CTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAA +TAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTT +GAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGG +ACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCA +GTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACA +TCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATT +TGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAAC +AGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAA +ATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAA +TAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTA +TAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACA +GGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGA +CAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAA 
+GAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATG +GGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAG +GGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGG +AAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGG +AATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAA +TTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAG +ATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGA +GGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTT +TCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACA +GATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGC +TTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTG +GAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTG +AGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGC +TTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAA +TGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGC +AATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCA +GGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGC +TAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAAC +TACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGAT +AGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAG +TGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGC +TGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCG +AGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAG +CTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTC diff --git a/data/MERS.fa b/data/MERS.fa new file 
mode 100644 index 0000000..18de91c --- /dev/null +++ b/data/MERS.fa @@ -0,0 +1,378 @@ +>kraken:taxid|1335626|NC_019843.3 Middle East respiratory syndrome coronavirus, complete genome +GATTTAAGTGAATAGCTTGGCTATCTCACTTCCCCTCGTTCTCTTGCAGAACTTTGATTTTAACGAACTTAAATAAAAGC +CCTGTTGTTTAGCGTATCGTTGCACTTGTCTGGTGGGATTGTGGCATTAATTTGCCTGCTCATCTAGGCAGTGGACATAT +GCTCAACACTGGGTATAATTCTAATTGAATACTATTTTTCAGTTAGAGCGTCGTGTCTCTTGTACGTCTCGGTCACAATA +CACGGTTTCGTCCGGTGCGTGGCAATTCGGGGCACATCATGTCTTTCGTGGCTGGTGTGACCGCGCAAGGTGCGCGCGGT +ACGTATCGAGCAGCGCTCAACTCTGAAAAACATCAAGACCATGTGTCTCTAACTGTGCCACTCTGTGGTTCAGGAAACCT +GGTTGAAAAACTTTCACCATGGTTCATGGATGGCGAAAATGCCTATGAAGTGGTGAAGGCCATGTTACTTAAAAAGGAGC +CACTTCTCTATGTGCCCATCCGGCTGGCTGGACACACTAGACACCTCCCAGGTCCTCGTGTGTACCTGGTTGAGAGGCTC +ATTGCTTGTGAAAATCCATTCATGGTTAACCAATTGGCTTATAGCTCTAGTGCAAATGGCAGCCTGGTTGGCACAACTTT +GCAGGGCAAGCCTATTGGTATGTTCTTCCCTTATGACATCGAACTTGTCACAGGAAAGCAAAATATTCTCCTGCGCAAGT +ATGGCCGTGGTGGTTATCACTACACCCCATTCCACTATGAGCGAGACAACACCTCTTGCCCTGAGTGGATGGACGATTTT +GAGGCGGATCCTAAAGGCAAATATGCCCAGAATCTGCTTAAGAAGTTGATTGGCGGTGATGTCACTCCAGTTGACCAATA +CATGTGTGGCGTTGATGGAAAACCCATTAGTGCCTACGCATTTTTAATGGCCAAGGATGGAATAACCAAACTGGCTGATG +TTGAAGCGGACGTCGCAGCACGTGCTGATGACGAAGGCTTCATCACATTAAAGAACAATCTATATAGATTGGTTTGGCAT +GTTGAGCGTAAAGACGTTCCATATCCTAAGCAATCTATTTTTACTATTAATAGTGTGGTCCAAAAGGATGGTGTTGAAAA +CACTCCTCCTCACTATTTTACTCTTGGATGCAAAATTTTAACGCTCACCCCACGCAACAAGTGGAGTGGCGTTTCTGACT +TGTCCCTCAAACAAAAACTCCTTTACACCTTCTATGGTAAGGAGTCACTTGAGAACCCAACCTACATTTACCACTCCGCA +TTCATTGAGTGTGGAAGTTGTGGTAATGATTCCTGGCTTACAGGGAATGCTATCCAAGGGTTTGCCTGTGGATGTGGGGC +ATCATATACAGCTAATGATGTCGAAGTCCAATCATCTGGCATGATTAAGCCAAATGCTCTTCTTTGTGCTACTTGCCCCT +TTGCTAAGGGTGATAGCTGTTCTTCTAATTGCAAACATTCAGTTGCTCAGTTGGTTAGTTACCTTTCTGAACGCTGTAAT +GTTATTGCTGATTCTAAGTCCTTCACACTTATCTTTGGTGGCGTAGCTTACGCCTACTTTGGATGTGAGGAAGGTACTAT +GTACTTTGTGCCTAGAGCTAAGTCTGTTGTCTCAAGGATTGGAGACTCCATCTTTACAGGCTGTACTGGCTCTTGGAACA +AGGTCACTCAAATTGCTAACATGTTCTTGGAACAGACTCAGCATTCCCTTAACTTTGTGGGAGAGTTCGTTGTCAACGAT 
+GTTGTCCTCGCAATTCTCTCTGGAACCACAACTAATGTTGACAAAATACGCCAGCTTCTCAAAGGTGTCACCCTTGACAA +GTTGCGTGATTATTTAGCTGACTATGACGTAGCAGTCACTGCCGGCCCATTCATGGATAATGCTATTAATGTTGGTGGTA +CAGGATTACAGTATGCCGCCATTACTGCACCTTATGTAGTTCTCACTGGCTTAGGTGAGTCCTTTAAGAAAGTTGCAACC +ATACCGTATAAGGTTTGCAACTCTGTTAAGGATACTCTGGCTTATTATGCTCACAGCGTGTTGTACAGAGTTTTTCCTTA +TGACATGGATTCTGGTGTGTCATCCTTTAGTGAACTACTTTTTGATTGCGTTGATCTTTCAGTAGCTTCTACCTATTTTT +TAGTCCGCATCTTGCAAGATAAGACTGGCGACTTTATGTCTACAATTATTACTTCCTGCCAAACTGCTGTTAGTAAGCTT +CTAGATACATGTTTTGAAGCTACAGAAGCAACATTTAACTTCTTGTTAGATTTGGCAGGATTGTTCAGAATCTTTCTCCG +CAATGCCTATGTGTACACTTCACAAGGGTTTGTGGTGGTCAATGGCAAAGTTTCTACACTTGTCAAACAAGTGTTAGACT +TGCTTAATAAGGGTATGCAACTTTTGCATACAAAGGTCTCCTGGGCTGGTTCTAAAATCATTGCTGTTATCTACAGCGGC +AGGGAGTCTCTAATATTCCCATCGGGAACCTATTACTGTGTCACCACTAAGGCTAAGTCCGTTCAACAAGATCTTGACGT +TATTTTGCCTGGTGAGTTTTCCAAGAAGCAGTTAGGACTGCTCCAACCTACTGACAATTCTACAACTGTTAGTGTTACTG +TATCCAGTAACATGGTTGAAACTGTTGTGGGTCAACTTGAGCAAACTAATATGCATAGTCCTGATGTTATAGTAGGTGAC +TATGTCATTATTAGTGAAAAATTGTTTGTGCGTAGTAAGGAAGAAGACGGATTTGCCTTCTACCCTGCTTGCACTAATGG +TCATGCTGTACCGACTCTCTTTAGACTTAAGGGAGGTGCACCTGTAAAAAAAGTAGCCTTTGGCGGTGATCAAGTACATG +AGGTTGCTGCTGTAAGAAGTGTTACTGTCGAGTACAACATTCATGCTGTATTAGACACACTACTTGCTTCTTCTAGTCTT +AGAACCTTTGTTGTAGATAAGTCTTTGTCAATTGAGGAGTTTGCTGACGTAGTAAAGGAACAAGTCTCAGACTTGCTTGT +TAAATTACTGCGTGGAATGCCGATTCCAGATTTTGATTTAGACGATTTTATTGACGCACCATGCTATTGCTTTAACGCTG +AGGGTGATGCATCCTGGTCTTCTACTATGATCTTCTCTCTTCACCCCGTCGAGTGTGACGAGGAGTGTTCTGAAGTAGAG +GCTTCAGATTTAGAAGAAGGTGAATCAGAGTGCATTTCTGAGACTTCAACTGAACAAGTTGACGTTTCTCATGAGACTTC +TGACGACGAGTGGGCTGCTGCAGTTGATGAAGCGTTCCCTCTCGATGAAGCAGAAGATGTTACTGAATCTGTGCAAGAAG +AAGCACAACCAGTAGAAGTACCTGTTGAAGATATTGCGCAGGTTGTCATAGCTGACACCTTACAGGAAACTCCTGTTGTG +CCTGATACTGTTGAAGTCCCACCGCAAGTGGTGAAACTTCCGTCTGCACCTCAGACTATCCAGCCCGAGGTAAAAGAAGT +TGCACCTGTCTATGAGGCTGATACCGAACAGACACAGAATGTTACTGTTAAACCTAAGAGGTTACGCAAAAAGCGTAATG +TTGACCCTTTGTCCAATTTTGAACATAAGGTTATTACAGAGTGCGTTACCATAGTTTTAGGTGACGCAATTCAAGTAGCC 
+AAGTGCTATGGGGAGTCTGTGTTAGTTAATGCTGCTAACACACATCTTAAGCATGGCGGTGGTATCGCTGGTGCTATTAA +TGCGGCTTCAAAAGGGGCTGTCCAAAAAGAGTCAGATGAGTATATTCTGGCTAAAGGGCCGTTACAAGTAGGAGATTCAG +TTCTCTTGCAAGGCCATTCTCTAGCTAAGAATATCCTGCATGTCGTAGGCCCAGATGCCCGCGCTAAACAGGATGTTTCT +CTCCTTAGTAAGTGCTATAAGGCTATGAATGCATATCCTCTTGTAGTCACTCCTCTTGTTTCAGCAGGCATATTTGGTGT +AAAACCAGCTGTGTCTTTTGATTATCTTATTAGGGAGGCTAAGACTAGAGTTTTAGTCGTCGTTAATTCCCAAGATGTCT +ATAAGAGTCTTACCATAGTTGACATTCCACAGAGTTTGACTTTTTCATATGATGGGTTACGTGGCGCAATACGTAAAGCT +AAAGATTATGGTTTTACTGTTTTTGTGTGCACAGACAACTCTGCTAACACTAAAGTTCTTAGGAACAAGGGTGTTGATTA +TACTAAGAAGTTTCTTACAGTTGACGGTGTGCAATATTATTGCTACACGTCTAAGGACACTTTAGATGATATCTTACAAC +AGGCTAATAAGTCTGTTGGTATTATATCTATGCCTTTGGGATATGTGTCTCATGGTTTAGACTTAATGCAAGCAGGGAGT +GTCGTGCGTAGAGTTAACGTGCCCTACGTGTGTCTCCTAGCTAATAAAGAGCAAGAAGCTATTTTGATGTCTGAAGACGT +TAAGTTAAACCCTTCAGAAGATTTTATAAAGCACGTCCGCACTAATGGTGGTTACAATTCTTGGCATTTAGTCGAGGGTG +AACTATTGGTGCAAGACTTACGCTTAAATAAGCTCCTGCATTGGTCTGATCAAACCATATGCTACAAGGATAGTGTGTTT +TATGTTGTAAAGAATAGTACAGCTTTTCCATTTGAAACACTTTCAGCATGTCGTGCGTATTTGGATTCACGCACGACACA +GCAGTTAACAATCGAAGTCTTAGTGACTGTCGATGGTGTAAATTTTAGAACAGTCGTTCTAAATAATAAGAACACTTATA +GATCACAGCTTGGATGCGTTTTCTTTAATGGTGCTGATATTTCTGACACCATTCCTGATGAGAAACAGAATGGTCACAGT +TTATATCTAGCAGACAATTTGACTGCTGATGAAACAAAGGCGCTTAAAGAGTTATATGGCCCCGTTGATCCTACTTTCTT +ACACAGATTCTATTCACTTAAGGCTGCAGTCCATGGGTGGAAGATGGTTGTGTGTGATAAGGTACGTTCTCTCAAATTGA +GTGATAATAATTGTTATCTTAATGCAGTTATTATGACACTTGATTTATTGAAGGACATTAAATTTGTTATACCTGCTCTA +CAGCATGCATTTATGAAACATAAGGGCGGTGATTCAACTGACTTCATAGCCCTCATTATGGCTTATGGCAATTGCACATT +TGGTGCTCCAGATGATGCCTCTCGGTTACTTCATACCGTGCTTGCAAAGGCTGAGTTATGCTGTTCTGCACGCATGGTTT +GGAGAGAGTGGTGCAATGTCTGTGGCATAAAAGATGTTGTTCTACAAGGCTTAAAAGCTTGTTGTTACGTGGGTGTGCAA +ACTGTTGAAGATCTGCGTGCTCGCATGACATATGTATGCCAGTGTGGTGGTGAACGTCATCGGCAATTAGTCGAACACAC +CACCCCCTGGTTGCTGCTCTCAGGCACACCAAATGAAAAATTGGTGACAACCTCCACGGCGCCTGATTTTGTAGCATTTA +ATGTCTTTCAGGGCATTGAAACGGCTGTTGGCCATTATGTTCATGCTCGCCTGAAGGGTGGTCTTATTTTAAAGTTTGAC 
+TCTGGCACCGTTAGCAAGACTTCAGACTGGAAGTGCAAGGTGACAGATGTACTTTTCCCCGGCCAAAAATACAGTAGCGA +TTGTAATGTCGTACGGTATTCTTTGGACGGTAATTTCAGAACAGAGGTTGATCCCGACCTATCTGCTTTCTATGTTAAGG +ATGGTAAATACTTTACAAGTGAACCACCCGTAACATATTCACCAGCTACAATTTTAGCTGGTAGTGTCTACACTAATAGC +TGCCTTGTATCGTCTGATGGACAACCTGGCGGTGATGCTATTAGTTTGAGTTTTAATAACCTTTTAGGGTTTGATTCTAG +TAAACCAGTCACTAAGAAATACACTTACTCCTTCTTGCCTAAAGAAGACGGCGATGTGTTGTTGGCTGAGTTTGACACTT +ATGACCCTATTTATAAGAATGGTGCCATGTATAAAGGCAAACCAATTCTTTGGGTCAATAAAGCATCTTATGATACTAAT +CTTAATAAGTTCAATAGAGCTAGTTTGCGTCAAATTTTTGACGTAGCCCCCATTGAACTCGAAAATAAATTCACACCTTT +GAGTGTGGAGTCTACACCAGTTGAACCTCCAACTGTAGATGTGGTAGCACTTCAACAGGAAATGACAATTGTCAAATGTA +AGGGTTTAAATAAACCTTTCGTGAAGGACAATGTCAGTTTCGTTGCTGATGATTCAGGTACTCCCGTTGTTGAGTATCTG +TCTAAAGAAGACCTACATACATTGTATGTAGACCCTAAGTATCAAGTCATTGTCTTAAAAGACAATGTACTTTCTTCTAT +GCTTAGATTGCACACCGTTGAGTCAGGTGATATTAACGTTGTTGCAGCTTCCGGATCTTTGACACGTAAAGTGAAGTTAC +TATTTAGGGCTTCATTTTATTTCAAAGAATTTGCTACCCGCACTTTCACTGCTACCACTGCTGTAGGTAGTTGTATAAAG +AGTGTAGTGCGGCATCTAGGTGTTACTAAAGGCATATTGACAGGCTGTTTTAGTTTTGCCAAGATGTTATTTATGCTTCC +ACTAGCTTACTTTAGTGATTCAAAACTCGGCACCACAGAGGTTAAAGTGAGTGCTTTGAAAACAGCCGGCGTTGTGACAG +GTAATGTTGTAAAACAGTGTTGCACTGCTGCTGTTGATTTAAGTATGGATAAGTTGCGCCGTGTGGATTGGAAATCAACC +CTACGGTTGTTACTTATGTTATGCACAACTATGGTATTGTTGTCTTCTGTGTATCACTTGTATGTCTTCAATCAGGTCTT +ATCAAGTGATGTTATGTTTGAAGATGCCCAAGGTTTGAAAAAGTTCTACAAAGAAGTTAGAGCTTACCTAGGAATCTCTT +CTGCTTGTGACGGTCTTGCTTCAGCTTATAGGGCGAATTCCTTTGATGTACCTACATTCTGCGCAAACCGTTCTGCAATG +TGTAATTGGTGCTTGATTAGCCAAGATTCCATAACTCACTACCCAGCTCTTAAGATGGTTCAAACACATCTTAGCCACTA +TGTTCTTAACATAGATTGGTTGTGGTTTGCATTTGAGACTGGTTTGGCATACATGCTCTATACCTCGGCCTTCAACTGGT +TGTTGTTGGCAGGTACATTGCATTATTTCTTTGCACAGACTTCCATATTTGTAGACTGGCGGTCATACAATTATGCTGTG +TCTAGTGCCTTCTGGTTATTCACCCACATTCCAATGGCGGGTTTGGTACGAATGTATAATTTGTTAGCATGCCTTTGGCT +TTTACGCAAGTTTTATCAGCATGTAATCAATGGTTGCAAAGATACGGCATGCTTGCTCTGCTATAAGAGGAACCGACTTA +CTAGAGTTGAAGCTTCTACCGTTGTCTGTGGTGGAAAACGTACGTTTTATATCACAGCAAATGGCGGTATTTCATTCTGT 
+CGTAGGCATAATTGGAATTGTGTGGATTGTGACACTGCAGGTGTGGGGAATACCTTCATCTGTGAAGAAGTCGCAAATGA +CCTCACTACCGCCCTACGCAGGCCTATTAACGCTACGGATAGATCACATTATTATGTGGATTCCGTTACAGTTAAAGAGA +CTGTTGTTCAGTTTAATTATCGTAGAGACGGTCAACCATTCTACGAGCGGTTTCCCCTCTGCGCTTTTACAAATCTAGAT +AAGTTGAAGTTCAAAGAGGTCTGTAAAACTACTACTGGTATACCTGAATACAACTTTATCATCTACGACTCATCAGATCG +TGGCCAGGAAAGTTTAGCTAGGTCTGCATGTGTTTATTATTCTCAAGTCTTGTGTAAATCAATTCTTTTGGTTGACTCAA +GTTTGGTTACTTCTGTTGGTGATTCTAGTGAAATCGCCACTAAAATGTTTGATTCCTTTGTTAATAGTTTCGTCTCGCTG +TATAATGTCACACGCGATAAGTTGGAAAAACTTATCTCTACTGCTCGTGATGGCGTAAGGCGAGGCGATAACTTCCATAG +TGTCTTAACAACATTCATTGACGCAGCACGAGGCCCCGCAGGTGTGGAGTCTGATGTTGAGACCAATGAAATTGTTGACT +CTGTGCAGTATGCTCATAAACATGACATACAAATTACTAATGAGAGCTACAATAATTATGTACCCTCATATGTTAAACCT +GATAGTGTGTCTACCAGCGATTTAGGTAGTCTCATTGATTGTAATGCGGCTTCAGTTAACCAAATTGTCTTGCGTAATTC +TAATGGTGCTTGCATTTGGAACGCTGCTGCATATATGAAACTCTCGGATGCACTTAAACGACAGATTCGCATTGCATGCC +GTAAGTGTAATTTAGCTTTCCGGTTAACCACCTCAAAGCTACGCGCTAATGATAATATCTTATCAGTTAGATTCACTGCT +AACAAAATTGTTGGTGGTGCTCCTACATGGTTTAATGCGTTGCGTGACTTTACGTTAAAGGGTTATGTTCTTGCTACCAT +TATTGTGTTTCTGTGTGCTGTACTGATGTATTTGTGTTTACCTACATTTTCTATGGCACCTGTTGAATTTTATGAAGACC +GCATCTTGGACTTTAAAGTTCTTGATAATGGTATCATTAGGGATGTAAATCCTGATGATAAGTGCTTTGCTAATAAGCAC +CGGTCCTTCACACAATGGTATCATGAGCATGTTGGTGGTGTCTATGACAACTCTATCACATGCCCATTGACAGTTGCAGT +AATTGCTGGAGTTGCTGGTGCTCGCATTCCAGACGTACCTACTACATTGGCTTGGGTGAACAATCAGATAATTTTCTTTG +TTTCTCGAGTCTTTGCTAATACAGGCAGTGTTTGCTACACTCCTATAGATGAGATACCCTATAAGAGTTTCTCTGATAGT +GGTTGCATTCTTCCATCTGAGTGCACTATGTTTAGGGATGCAGAGGGCCGTATGACACCATACTGCCATGATCCTACTGT +TTTGCCTGGGGCTTTTGCGTACAGTCAGATGAGGCCTCATGTTCGTTACGACTTGTATGATGGTAACATGTTTATTAAAT +TTCCTGAAGTAGTATTTGAAAGTACACTTAGGATTACTAGAACTCTGTCAACTCAGTACTGCCGGTTCGGTAGTTGTGAG +TATGCACAAGAGGGTGTTTGTATTACCACAAATGGCTCGTGGGCCATTTTTAATGACCACCATCTTAATAGACCTGGTGT +CTATTGTGGCTCTGATTTTATTGACATTGTCAGGCGGTTAGCAGTATCACTGTTCCAGCCTATTACTTATTTCCAATTGA +CTACCTCATTGGTCTTGGGTATAGGTTTGTGTGCGTTCCTGACTTTGCTCTTCTATTATATTAATAAAGTAAAACGTGCT 
+TTTGCAGATTACACCCAGTGTGCTGTAATTGCTGTTGTTGCTGCTGTTCTTAATAGCTTGTGCATCTGCTTTGTTACCTC +TATACCATTGTGTATAGTACCTTACACTGCATTGTACTATTATGCTACATTCTATTTTACTAATGAGCCTGCATTTATTA +TGCATGTTTCTTGGTACATTATGTTCGGGCCTATCGTTCCCATATGGATGACCTGCGTCTATACAGTTGCAATGTGCTTT +AGACACTTCTTCTGGGTTTTAGCTTATTTTAGTAAGAAACATGTAGAAGTTTTTACTGATGGTAAGCTTAATTGTAGTTT +CCAGGACGCTGCCTCTAATATCTTTGTTATTAACAAGGACACTTATGCAGCTCTTAGAAACTCTTTAACTAATGATGCCT +ATTCACGATTTTTGGGGTTGTTTAACAAGTATAAGTACTTCTCTGGTGCTATGGAAACAGCCGCTTATCGTGAAGCTGCA +GCATGTCATCTTGCTAAAGCCTTACAAACATACAGCGAGACTGGTAGTGATCTTCTTTACCAACCACCCAACTGTAGCAT +AACCTCTGGCGTGTTGCAAAGCGGTTTGGTGAAAATGTCACATCCCAGTGGAGATGTTGAGGCTTGTATGGTTCAGGTTA +CCTGCGGTAGCATGACTCTTAATGGTCTTTGGCTTGACAACACAGTCTGGTGCCCACGACACGTAATGTGCCCGGCTGAC +CAGTTGTCTGATCCTAATTATGATGCCTTGTTGATTTCTATGACTAATCATAGTTTCAGTGTGCAAAAACACATTGGCGC +TCCAGCAAACTTGCGTGTTGTTGGTCATGCCATGCAAGGCACTCTTTTGAAGTTGACTGTCGATGTTGCTAACCCTAGCA +CTCCAGCCTACACTTTTACAACAGTGAAACCTGGCGCAGCATTTAGTGTGTTAGCATGCTATAATGGTCGTCCGACTGGT +ACATTCACTGTTGTAATGCGCCCTAACTACACAATTAAGGGTTCCTTTCTGTGTGGTTCTTGTGGTAGTGTTGGTTACAC +CAAGGAGGGTAGTGTGATCAATTTCTGTTACATGCATCAAATGGAACTTGCTAATGGTACACATACCGGTTCAGCATTTG +ATGGTACTATGTATGGTGCCTTTATGGATAAACAAGTGCACCAAGTTCAGTTAACAGACAAATACTGCAGTGTTAATGTA +GTAGCTTGGCTTTACGCAGCAATACTTAATGGTTGCGCTTGGTTTGTAAAACCTAATCGCACTAGTGTTGTTTCTTTTAA +TGAATGGGCTCTTGCCAACCAATTCACTGAATTTGTTGGCACTCAATCCGTTGACATGTTAGCTGTCAAAACAGGCGTTG +CTATTGAACAGCTGCTTTATGCGATCCAACAACTGTATACTGGGTTCCAGGGAAAGCAAATCCTTGGCAGTACCATGTTG +GAAGATGAATTCACACCTGAGGATGTTAATATGCAGATTATGGGTGTGGTTATGCAGAGTGGTGTGAGAAAAGTTACATA +TGGTACTGCGCATTGGTTGTTTGCGACCCTTGTCTCAACCTATGTGATAATCTTACAAGCCACTAAATTTACTTTGTGGA +ACTACTTGTTTGAGACTATTCCCACACAGTTGTTCCCACTCTTATTTGTGACTATGGCCTTCGTTATGTTGTTGGTTAAA +CACAAACACACCTTTTTGACACTTTTCTTGTTGCCTGTGGCTATTTGTTTGACTTATGCAAACATAGTCTACGAGCCCAC +TACTCCCATTTCGTCAGCGCTGATTGCAGTTGCAAATTGGCTTGCCCCCACTAATGCTTATATGCGCACTACACATACTG +ATATTGGTGTCTACATTAGTATGTCACTTGTATTAGTCATTGTAGTGAAGAGATTGTACAACCCATCACTTTCTAACTTT 
+GCGTTAGCATTGTGCAGTGGTGTAATGTGGTTGTACACTTATAGCATTGGAGAAGCCTCAAGCCCCATTGCCTATCTGGT +TTTTGTCACTACACTCACTAGTGATTATACGATTACAGTCTTTGTTACTGTCAACCTTGCAAAAGTTTGCACTTATGCCA +TCTTTGCTTACTCACCACAGCTTACACTTGTGTTTCCGGAAGTGAAGATGATACTTTTATTATACACATGTTTAGGTTTC +ATGTGTACTTGCTATTTTGGTGTCTTCTCTCTTTTGAACCTTAAGCTTAGAGCACCTATGGGTGTCTATGACTTTAAGGT +CTCAACACAAGAGTTCAGATTCATGACTGCTAACAATCTAACTGCACCTAGAAATTCTTGGGAGGCTATGGCTCTGAACT +TTAAGTTAATAGGTATTGGCGGTACACCTTGTATAAAGGTTGCTGCTATGCAGTCTAAACTTACAGATCTTAAATGCACA +TCTGTGGTTCTCCTCTCTGTGCTCCAACAGTTACACTTAGAGGCTAATAGTAGGGCCTGGGCTTTCTGTGTTAAATGCCA +TAATGATATATTGGCAGCAACAGACCCCAGTGAGGCTTTCGAGAAATTCGTAAGTCTCTTTGCTACTTTAATGACTTTTT +CTGGTAATGTAGATCTTGATGCGTTAGCTAGTGATATTTTTGACACTCCTAGCGTACTTCAAGCTACTCTTTCTGAGTTT +TCACACTTAGCTACCTTTGCTGAGTTGGAAGCTGCGCAGAAAGCCTATCAGGAAGCTATGGACTCTGGTGACACCTCACC +ACAAGTTCTTAAGGCTTTGCAGAAGGCTGTTAATATAGCTAAAAACGCCTATGAGAAGGATAAGGCAGTGGCCCGTAAGT +TAGAACGTATGGCTGATCAGGCTATGACTTCTATGTATAAGCAAGCACGTGCTGAAGACAAGAAAGCAAAAATTGTCAGT +GCTATGCAAACTATGTTGTTTGGTATGATTAAGAAGCTCGACAACGATGTTCTTAATGGTATCATTTCTAACGCTAGGAA +TGGTTGTATACCTCTTAGTGTCATCCCACTGTGTGCTTCAAATAAACTTCGCGTTGTAATTCCTGACTTCACCGTCTGGA +ATCAGGTAGTCACATATCCCTCGCTTAACTACGCTGGGGCTTTGTGGGACATTACAGTTATAAACAATGTGGACAATGAA +ATTGTTAAGTCTTCAGATGTTGTAGACAGCAATGAAAATTTAACATGGCCACTTGTTTTAGAATGCACTAGGGCATCCAC +TTCTGCCGTTAAGTTGCAAAATAATGAGATCAAACCTTCAGGTCTAAAAACCATGGTTGTGTCTGCGGGTCAAGAGCAAA +CTAACTGTAATACTAGTTCCTTAGCTTATTACGAACCTGTGCAGGGTCGTAAAATGCTGATGGCTCTTCTTTCTGATAAT +GCCTATCTCAAATGGGCGCGTGTTGAAGGTAAGGACGGATTTGTCAGTGTAGAGCTACAACCTCCTTGCAAATTCTTGAT +TGCGGGACCAAAAGGACCTGAAATCCGATATCTCTATTTTGTTAAAAATCTTAACAACCTTCATCGCGGGCAAGTGTTAG +GGCACATTGCTGCGACTGTTAGATTGCAAGCTGGTTCTAACACCGAGTTTGCCTCTAATTCCTCGGTGTTGTCACTTGTT +AACTTCACCGTTGATCCTCAAAAAGCTTATCTCGATTTCGTCAATGCGGGAGGTGCCCCATTGACAAATTGTGTTAAGAT +GCTTACTCCTAAAACTGGTACAGGTATAGCTATATCTGTTAAACCAGAGAGTACAGCTGATCAAGAGACTTATGGTGGAG +CTTCAGTGTGTCTCTATTGCCGTGCGCATATAGAACATCCTGATGTCTCTGGTGTTTGTAAATATAAGGGTAAGTTTGTC 
+CAAATCCCTGCTCAGTGTGTCCGTGACCCTGTGGGATTTTGTTTGTCAAATACCCCCTGTAATGTCTGTCAATATTGGAT +TGGATATGGGTGCAATTGTGACTCGCTTAGGCAAGCAGCACTGCCCCAATCTAAAGATTCCAATTTTTTAAACGAGTCCG +GGGTTCTATTGTAAATGCCCGAATAGAACCCTGTTCAAGTGGTTTGTCCACTGATGTCGTCTTTAGGGCATTTGACATCT +GCAACTATAAGGCTAAGGTTGCTGGTATTGGAAAATACTACAAGACTAATACTTGTAGGTTTGTAGAATTAGATGACCAA +GGGCATCATTTAGACTCCTATTTTGTCGTTAAGAGGCATACTATGGAGAATTATGAACTAGAGAAGCACTGTTACGACTT +GTTACGTGACTGTGATGCTGTAGCTCCCCATGATTTCTTCATCTTTGATGTAGACAAAGTTAAAACACCTCATATTGTAC +GTCAGCGTTTAACTGAGTACACTATGATGGATCTTGTATATGCCCTGAGGCACTTTGATCAAAATAGCGAAGTGCTTAAG +GCTATCTTAGTGAAGTATGGTTGCTGTGATGTTACCTACTTTGAAAATAAACTCTGGTTTGATTTTGTTGAAAATCCCAG +TGTTATTGGTGTTTATCATAAACTTGGAGAACGTGTACGCCAAGCTATCTTAAACACTGTTAAATTTTGTGACCACATGG +TCAAGGCTGGTTTAGTCGGTGTGCTCACACTAGACAACCAGGACCTTAATGGCAAGTGGTATGATTTTGGTGACTTCGTA +ATCACTCAACCTGGTTCAGGAGTAGCTATAGTTGATAGCTACTATTCTTATTTGATGCCTGTGCTCTCAATGACCGATTG +TCTGGCCGCTGAGACACATAGGGATTGTGATTTTAATAAACCACTCATTGAGTGGCCACTTACTGAGTATGATTTTACTG +ATTATAAGGTACAACTCTTTGAGAAGTACTTTAAATATTGGGATCAGACGTATCACGCAAATTGCGTTAATTGTACTGAT +GACCGTTGTGTGTTACATTGTGCTAATTTCAATGTATTGTTTGCTATGACCATGCCTAAGACTTGTTTCGGACCCATAGT +CCGAAAGATCTTTGTTGATGGCGTGCCATTTGTAGTATCTTGTGGTTATCACTACAAAGAATTAGGTTTAGTCATGAATA +TGGATGTTAGTCTCCATAGACATAGGCTCTCTCTTAAGGAGTTGATGATGTATGCCGCTGATCCAGCCATGCACATTGCC +TCCTCTAACGCTTTTCTTGATTTGAGGACATCATGTTTTAGTGTCGCTGCACTTACAACTGGTTTGACTTTTCAAACTGT +GCGGCCTGGCAATTTTAACCAAGACTTCTATGATTTCGTGGTATCTAAAGGTTTCTTTAAGGAGGGCTCTTCAGTGACGC +TCAAACATTTTTTCTTTGCTCAAGATGGTAATGCTGCTATTACAGATTATAATTACTATTCTTATAATCTGCCTACTATG +TGTGACATCAAACAAATGTTGTTCTGCATGGAAGTTGTAAACAAGTACTTCGAAATCTATGACGGTGGTTGTCTTAATGC +TTCTGAAGTGGTTGTTAATAATTTAGACAAGAGTGCTGGCCATCCTTTTAATAAGTTTGGCAAAGCTCGTGTCTATTATG +AGAGCATGTCTTACCAGGAGCAAGATGAACTTTTTGCCATGACAAAGCGTAACGTCATTCCTACCATGACTCAAATGAAT +CTAAAATATGCTATTAGTGCTAAGAATAGAGCTCGCACTGTTGCAGGCGTGTCCATACTTAGCACAATGACTAATCGCCA +GTACCATCAGAAAATGCTTAAGTCCATGGCTGCAACTCGTGGAGCGACTTGCGTCATTGGTACTACAAAGTTCTACGGTG 
+GCTGGGATTTCATGCTTAAAACATTGTACAAAGATGTTGATAATCCGCATCTTATGGGTTGGGATTACCCTAAGTGTGAT +AGAGCTATGCCTAATATGTGTAGAATCTTCGCTTCACTCATATTAGCTCGTAAACATGGCACTTGTTGTACTACAAGGGA +CAGATTTTATCGCTTGGCAAATGAGTGTGCTCAGGTGCTAAGCGAATATGTTCTATGTGGTGGTGGTTACTACGTCAAAC +CTGGAGGTACCAGTAGCGGAGATGCCACCACTGCATATGCCAATAGTGTCTTTAACATTTTGCAGGCGACAACTGCTAAT +GTCAGTGCACTTATGGGTGCTAATGGCAACAAGATTGTTGACAAAGAAGTTAAAGACATGCAGTTTGATTTGTATGTCAA +TGTTTACAGGAGCACTAGCCCAGACCCCAAATTTGTTGATAAATACTATGCTTTTCTTAATAAGCACTTTTCTATGATGA +TACTGTCTGATGACGGTGTCGTTTGCTATAATAGTGATTATGCAGCTAAGGGTTACATTGCTGGAATACAGAATTTTAAG +GAAACGCTGTATTATCAGAACAATGTCTTTATGTCTGAAGCTAAATGCTGGGTGGAAACCGATCTGAAGAAAGGGCCACA +TGAATTCTGTTCACAGCATACGCTTTATATTAAGGATGGCGACGATGGTTACTTCCTTCCTTATCCAGACCCTTCAAGAA +TTTTGTCTGCCGGTTGCTTTGTAGATGATATCGTTAAGACTGACGGTACACTCATGGTAGAGCGGTTTGTGTCTTTGGCT +ATAGATGCTTACCCTCTCACAAAGCATGAAGATATAGAATACCAGAATGTATTCTGGGTCTACTTACAGTATATAGAAAA +ACTGTATAAAGACCTTACAGGACACATGCTTGACAGTTATTCTGTCATGCTATGTGGTGATAATTCTGCTAAGTTTTGGG +AAGAGGCATTCTATAGAGATCTCTATAGTTCGCCTACCACTTTGCAGGCTGTCGGTTCATGCGTTGTATGCCATTCACAG +ACTTCCCTACGCTGTGGGACATGCATCCGTAGACCATTTCTCTGCTGTAAATGCTGCTATGATCATGTTATAGCAACTCC +ACATAAGATGGTTTTGTCTGTTTCTCCTTACGTTTGTAATGCCCCTGGTTGTGGCGTTTCAGACGTTACTAAGCTATATT +TAGGTGGTATGAGCTACTTTTGTGTAGATCATAGACCTGTGTGTAGTTTTCCACTTTGCGCTAATGGTCTTGTATTCGGC +TTATACAAGAATATGTGCACAGGTAGTCCTTCTATAGTTGAATTTAATAGGTTGGCTACCTGTGACTGGACTGAAAGTGG +TGATTACACCCTTGCCAATACTACAACAGAACCACTCAAACTTTTTGCTGCTGAGACTTTACGTGCCACTGAAGAGGCGT +CTAAGCAGTCTTATGCTATTGCCACCATCAAAGAAATTGTTGGTGAGCGCCAACTATTACTTGTGTGGGAGGCTGGCAAG +TCCAAACCACCACTCAATCGTAATTATGTTTTTACTGGTTATCATATAACCAAAAATAGTAAAGTGCAGCTCGGTGAGTA +CATTTTCGAGCGCATTGATTATAGTGATGCTGTATCCTACAAGTCTAGTACAACGTATAAACTGACTGTAGGTGACATCT +TCGTACTTACCTCTCACTCTGTGGCTACCTTGACGGCGCCCACAATTGTGAATCAAGAGAGGTATGTTAAAATTACTGGG +TTGTACCCAACCATTACGGTACCTGAAGAGTTCGCAAGTCATGTTGCCAACTTCCAAAAATCAGGTTATAGTAAATATGT +CACTGTTCAGGGACCACCTGGCACTGGCAAAAGTCATTTTGCTATAGGGTTAGCGATTTACTACCCTACAGCACGTGTTG 
+TTTATACAGCATGTTCACACGCAGCTGTTGATGCTTTGTGTGAAAAAGCTTTTAAATATTTGAACATTGCTAAATGTTCC +CGTATCATTCCTGCAAAGGCACGTGTTGAGTGCTATGACAGGTTTAAAGTTAATGAGACAAATTCTCAATATTTGTTTAG +TACTATTAATGCTCTACCAGAAACTTCTGCCGATATTCTGGTGGTTGATGAGGTTAGTATGTGCACTAATTATGATCTTT +CAATTATTAATGCACGTATTAAAGCTAAGCACATTGTCTATGTAGGAGATCCAGCACAGTTGCCAGCTCCTAGGACTTTG +TTGACTAGAGGCACATTGGAACCAGAAAATTTCAATAGTGTCACTAGATTGATGTGTAACTTAGGTCCTGACATATTTTT +AAGTATGTGCTACAGGTGTCCTAAGGAAATAGTAAGCACTGTGAGCGCTCTTGTCTACAATAATAAATTGTTAGCCAAGA +AGGAGCTTTCAGGCCAGTGCTTTAAAATACTCTATAAGGGCAATGTGACGCATGATGCTAGCTCTGCCATTAATAGACCA +CAACTCACATTTGTGAAGAATTTTATTACTGCCAATCCGGCATGGAGTAAGGCAGTCTTTATTTCGCCTTACAATTCACA +GAATGCTGTGTCTCGTTCAATGCTGGGTCTTACCACTCAGACTGTTGATTCCTCACAGGGTTCAGAATACCAGTACGTTA +TCTTCTGTCAAACAGCAGATACGGCACATGCTAACAACATTAACAGATTTAATGTTGCAATCACTCGTGCCCAAAAAGGT +ATTCTTTGTGTTATGACATCTCAGGCACTCTTTGAGTCCTTAGAGTTTACTGAATTGTCTTTTACTAATTACAAGCTCCA +GTCTCAGATTGTAACTGGCCTTTTTAAAGATTGCTCTAGAGAAACTTCTGGCCTCTCACCTGCTTATGCACCAACATATG +TTAGTGTTGATGACAAGTATAAGACGAGTGATGAGCTTTGCGTGAATCTTAATTTACCCGCAAATGTCCCATACTCTCGT +GTTATTTCCAGGATGGGCTTTAAACTCGATGCAACAGTTCCTGGATATCCTAAGCTTTTCATTACTCGTGAAGAGGCTGT +AAGGCAAGTTCGAAGCTGGATAGGCTTCGATGTTGAGGGTGCTCATGCTTCCCGTAATGCATGTGGCACCAATGTGCCTC +TACAATTAGGATTTTCAACTGGTGTGAACTTTGTTGTTCAGCCAGTTGGTGTTGTAGACACTGAGTGGGGTAACATGTTA +ACGGGCATTGCTGCACGTCCTCCACCAGGTGAACAGTTTAAGCACCTCGTGCCTCTTATGCATAAGGGGGCTGCGTGGCC +TATTGTTAGACGACGTATAGTGCAAATGTTGTCAGACACTTTAGACAAATTGTCTGATTACTGTACGTTTGTTTGTTGGG +CTCATGGCTTTGAATTAACGTCTGCATCATACTTTTGCAAGATAGGTAAGGAACAGAAGTGTTGCATGTGCAATAGACGC +GCTGCAGCGTACTCTTCACCTCTGCAATCTTATGCCTGCTGGACTCATTCCTGCGGTTATGATTATGTCTACAACCCTTT +CTTTGTCGATGTTCAACAGTGGGGTTATGTAGGCAATCTTGCTACTAATCACGATCGTTATTGCTCTGTCCATCAAGGAG +CTCATGTGGCTTCTAATGATGCAATAATGACTCGTTGTTTAGCTATTCATTCTTGTTTTATAGAACGTGTGGATTGGGAT +ATAGAGTATCCTTATATCTCACATGAAAAGAAATTGAATTCCTGTTGTAGAATCGTTGAGCGCAACGTCGTACGTGCTGC +TCTTCTTGCCGGTTCATTTGACAAAGTCTATGATATTGGCAATCCTAAAGGAATTCCTATTGTTGATGACCCTGTGGTTG 
+ATTGGCATTATTTTGATGCACAGCCCTTGACCAGGAAGGTACAACAGCTTTTCTATACAGAGGACATGGCCTCAAGATTT +GCTGATGGGCTCTGCTTATTTTGGAACTGTAATGTACCAAAATATCCTAATAATGCAATTGTATGCAGGTTTGACACACG +TGTGCATTCTGAGTTCAATTTGCCAGGTTGTGATGGCGGTAGTTTGTATGTTAACAAGCACGCTTTTCATACACCAGCAT +ATGATGTGAGTGCATTCCGTGATCTGAAACCTTTACCATTCTTTTATTATTCTACTACACCATGTGAAGTGCATGGTAAT +GGTAGTATGATAGAGGATATTGATTATGTACCCCTAAAATCTGCAGTCTGTATTACAGCTTGTAATTTAGGGGGCGCTGT +TTGTAGGAAGCATGCTACAGAGTACAGAGAGTATATGGAAGCATATAATCTTGTCTCTGCATCAGGTTTCCGCCTTTGGT +GTTATAAGACCTTTGATATTTATAATCTCTGGTCTACTTTTACAAAAGTTCAAGGTTTGGAAAACATTGCTTTTAATGTT +GTTAAACAAGGCCATTTTATTGGTGTTGAGGGTGAACTACCTGTAGCTGTAGTCAATGATAAGATCTTCACCAAGAGTGG +CGTTAATGACATTTGTATGTTTGAGAATAAAACCACTTTGCCTACTAATATAGCTTTTGAACTCTATGCTAAGCGTGCTG +TACGCTCGCATCCCGATTTCAAATTGCTACACAATTTACAAGCAGACATTTGCTACAAGTTCGTCCTTTGGGATTATGAA +CGTAGCAATATTTATGGTACTGCTACTATTGGTGTATGTAAGTACACTGATATTGATGTTAATTCAGCTTTGAATATATG +TTTTGACATACGCGATAATTGTTCATTGGAGAAGTTCATGTCTACTCCCAATGCCATCTTTATTTCTGATAGAAAAATCA +AGAAATACCCTTGTATGGTAGGTCCTGATTATGCTTACTTCAATGGTGCTATCATCCGTGATAGTGATGTTGTTAAACAA +CCAGTGAAGTTCTACTTGTATAAGAAAGTCAATAATGAGTTTATTGATCCTACTGAGTGTATTTACACTCAGAGTCGCTC +TTGTAGTGACTTCCTACCCCTTTCTGACATGGAGAAAGACTTTCTATCTTTTGATAGTGATGTTTTCATTAAGAAGTATG +GCTTGGAAAACTATGCTTTTGAGCACGTAGTCTATGGAGACTTCTCTCATACTACGTTAGGCGGTCTTCACTTGCTTATT +GGTTTATACAAGAAGCAACAGGAAGGTCATATTATTATGGAAGAAATGCTAAAAGGTAGCTCAACTATTCATAACTATTT +TATTACTGAGACTAACACAGCGGCTTTTAAGGCGGTGTGTTCTGTTATAGATTTAAAGCTTGACGACTTTGTTATGATTT +TAAAGAGTCAAGACCTTGGCGTAGTATCCAAGGTTGTCAAGGTTCCTATTGACTTAACAATGATTGAGTTTATGTTATGG +TGTAAGGATGGACAGGTTCAAACCTTCTACCCTCGACTCCAGGCTTCTGCAGATTGGAAACCTGGTCATGCAATGCCATC +CCTCTTTAAAGTTCAAAATGTAAACCTTGAACGTTGTGAGCTTGCTAATTACAAGCAATCTATTCCTATGCCTCGCGGTG +TGCACATGAACATCGCTAAATATATGCAATTGTGCCAGTATTTAAATACTTGCACATTAGCCGTGCCTGCCAATATGCGT +GTTATACATTTTGGCGCTGGTTCTGATAAAGGTATCGCTCCTGGTACCTCAGTTTTACGACAGTGGCTTCCTACAGATGC +CATTATTATAGATAATGATTTAAATGAGTTCGTGTCAGATGCTGACATAACTTTATTTGGAGATTGTGTAACTGTACGTG 
+TCGGCCAACAAGTGGATCTTGTTATTTCCGACATGTATGATCCTACTACTAAGAATGTAACAGGTAGTAATGAGTCAAAG +GCTTTATTCTTTACTTACCTGTGTAACCTCATTAATAATAATCTTGCTCTTGGTGGGTCTGTTGCTATTAAAATAACAGA +ACACTCTTGGAGCGTTGAACTTTATGAACTTATGGGAAAATTTGCTTGGTGGACTGTTTTCTGCACCAATGCAAATGCAT +CCTCATCTGAAGGATTCCTCTTAGGTATTAATTACTTGGGTACTATTAAAGAAAATATAGATGGTGGTGCTATGCACGCC +AACTATATATTTTGGAGAAATTCCACTCCTATGAATCTGAGTACTTACTCACTTTTTGATTTATCCAAGTTTCAATTAAA +ATTAAAAGGAACACCAGTTCTTCAATTAAAGGAGAGTCAAATTAACGAACTCGTAATATCTCTCCTGTCGCAGGGTAAGT +TACTTATCCGTGACAATGATACACTCAGTGTTTCTACTGATGTTCTTGTTAACACCTACAGAAAGTTACGTTGATGTAGG +GCCAGATTCTGTTAAGTCTGCTTGTATTGAGGTTGATATACAACAGACTTTCTTTGATAAAACTTGGCCTAGGCCAATTG +ATGTTTCTAAGGCTGACGGTATTATATACCCTCAAGGCCGTACATATTCTAACATAACTATCACTTATCAAGGTCTTTTT +CCCTATCAGGGAGACCATGGTGATATGTATGTTTACTCTGCAGGACATGCTACAGGCACAACTCCACAAAAGTTGTTTGT +AGCTAACTATTCTCAGGACGTCAAACAGTTTGCTAATGGGTTTGTCGTCCGTATAGGAGCAGCTGCCAATTCCACTGGCA +CTGTTATTATTAGCCCATCTACCAGCGCTACTATACGAAAAATTTACCCTGCTTTTATGCTGGGTTCTTCAGTTGGTAAT +TTCTCAGATGGTAAAATGGGCCGCTTCTTCAATCATACTCTAGTTCTTTTGCCCGATGGATGTGGCACTTTACTTAGAGC +TTTTTATTGTATTCTAGAGCCTCGCTCTGGAAATCATTGTCCTGCTGGCAATTCCTATACTTCTTTTGCCACTTATCACA +CTCCTGCAACAGATTGTTCTGATGGCAATTACAATCGTAATGCCAGTCTGAACTCTTTTAAGGAGTATTTTAATTTACGT +AACTGCACCTTTATGTACACTTATAACATTACCGAAGATGAGATTTTAGAGTGGTTTGGCATTACACAAACTGCTCAAGG +TGTTCACCTCTTCTCATCTCGGTATGTTGATTTGTACGGCGGCAATATGTTTCAATTTGCCACCTTGCCTGTTTATGATA +CTATTAAGTATTATTCTATCATTCCTCACAGTATTCGTTCTATCCAAAGTGATAGAAAAGCTTGGGCTGCCTTCTACGTA +TATAAACTTCAACCGTTAACTTTCCTGTTGGATTTTTCTGTTGATGGTTATATACGCAGAGCTATAGACTGTGGTTTTAA +TGATTTGTCACAACTCCACTGCTCATATGAATCCTTCGATGTTGAATCTGGAGTTTATTCAGTTTCGTCTTTCGAAGCAA +AACCTTCTGGCTCAGTTGTGGAACAGGCTGAAGGTGTTGAATGTGATTTTTCACCTCTTCTGTCTGGCACACCTCCTCAG +GTTTATAATTTCAAGCGTTTGGTTTTTACCAATTGCAATTATAATCTTACCAAATTGCTTTCACTTTTTTCTGTGAATGA +TTTTACTTGTAGTCAAATATCTCCAGCAGCAATTGCTAGCAACTGTTATTCTTCACTGATTTTGGATTACTTTTCATACC +CACTTAGTATGAAATCCGATCTCAGTGTTAGTTCTGCTGGTCCAATATCCCAGTTTAATTATAAACAGTCCTTTTCTAAT 
+CCCACATGTTTGATTTTAGCGACTGTTCCTCATAACCTTACTACTATTACTAAGCCTCTTAAGTACAGCTATATTAACAA +GTGCTCTCGTCTTCTTTCTGATGATCGTACTGAAGTACCTCAGTTAGTGAACGCTAATCAATACTCACCCTGTGTATCCA +TTGTCCCATCCACTGTGTGGGAAGACGGTGATTATTATAGGAAACAACTATCTCCACTTGAAGGTGGTGGCTGGCTTGTT +GCTAGTGGCTCAACTGTTGCCATGACTGAGCAATTACAGATGGGCTTTGGTATTACAGTTCAATATGGTACAGACACCAA +TAGTGTTTGCCCCAAGCTTGAATTTGCTAATGACACAAAAATTGCCTCTCAATTAGGCAATTGCGTGGAATATTCCCTCT +ATGGTGTTTCGGGCCGTGGTGTTTTTCAGAATTGCACAGCTGTAGGTGTTCGACAGCAGCGCTTTGTTTATGATGCGTAC +CAGAATTTAGTTGGCTATTATTCTGATGATGGCAACTACTACTGTTTGCGTGCTTGTGTTAGTGTTCCTGTTTCTGTCAT +CTATGATAAAGAAACTAAAACCCACGCTACTCTATTTGGTAGTGTTGCATGTGAACACATTTCTTCTACCATGTCTCAAT +ACTCCCGTTCTACGCGATCAATGCTTAAACGGCGAGATTCTACATATGGCCCCCTTCAGACACCTGTTGGTTGTGTCCTA +GGACTTGTTAATTCCTCTTTGTTCGTAGAGGACTGCAAGTTGCCTCTTGGTCAATCTCTCTGTGCTCTTCCTGACACACC +TAGTACTCTCACACCTCGCAGTGTGCGCTCTGTTCCAGGTGAAATGCGCTTGGCATCCATTGCTTTTAATCATCCTATTC +AGGTTGATCAACTTAATAGTAGTTATTTTAAATTAAGTATACCCACTAATTTTTCCTTTGGTGTGACTCAGGAGTACATT +CAGACAACCATTCAGAAAGTTACTGTTGATTGTAAACAGTACGTTTGCAATGGTTTCCAGAAGTGTGAGCAATTACTGCG +CGAGTATGGCCAGTTTTGTTCCAAAATAAACCAGGCTCTCCATGGTGCCAATTTACGCCAGGATGATTCTGTACGTAATT +TGTTTGCGAGCGTGAAAAGCTCTCAATCATCTCCTATCATACCAGGTTTTGGAGGTGACTTTAATTTGACACTTCTAGAA +CCTGTTTCTATATCTACTGGCAGTCGTAGTGCACGTAGTGCTATTGAGGATTTGCTATTTGACAAAGTCACTATAGCTGA +TCCTGGTTATATGCAAGGTTACGATGATTGCATGCAGCAAGGTCCAGCATCAGCTCGTGATCTTATTTGTGCTCAATATG +TGGCTGGTTACAAAGTATTACCTCCTCTTATGGATGTTAATATGGAAGCCGCGTATACTTCATCTTTGCTTGGCAGCATA +GCAGGTGTTGGCTGGACTGCTGGCTTATCCTCCTTTGCTGCTATTCCATTTGCACAGAGTATCTTTTATAGGTTAAACGG +TGTTGGCATTACTCAACAGGTTCTTTCAGAGAACCAAAAGCTTATTGCCAATAAGTTTAATCAGGCTCTGGGAGCTATGC +AAACAGGCTTCACTACAACTAATGAAGCTTTTCAGAAGGTTCAGGATGCTGTGAACAACAATGCACAGGCTCTATCCAAA +TTAGCTAGCGAGCTATCTAATACTTTTGGTGCTATTTCCGCCTCTATTGGAGACATCATACAACGTCTTGATGTTCTCGA +ACAGGACGCCCAAATAGACAGACTTATTAATGGCCGTTTGACAACACTAAATGCTTTTGTTGCACAGCAGCTTGTTCGTT +CCGAATCAGCTGCTCTTTCCGCTCAATTGGCTAAAGATAAAGTCAATGAGTGTGTCAAGGCACAATCCAAGCGTTCTGGA 
+TTTTGCGGTCAAGGCACACATATAGTGTCCTTTGTTGTAAATGCCCCTAATGGCCTTTACTTCATGCATGTTGGTTATTA +CCCTAGCAACCACATTGAGGTTGTTTCTGCTTATGGTCTTTGCGATGCAGCTAACCCTACTAATTGTATAGCCCCTGTTA +ATGGCTACTTTATTAAAACTAATAACACTAGGATTGTTGATGAGTGGTCATATACTGGCTCGTCCTTCTATGCACCTGAG +CCCATTACCTCCCTTAATACTAAGTATGTTGCACCACAGGTGACATACCAAAACATTTCTACTAACCTCCCTCCTCCTCT +TCTCGGCAATTCCACCGGGATTGACTTCCAAGATGAGTTGGATGAGTTTTTCAAAAATGTTAGCACCAGTATACCTAATT +TTGGTTCCCTAACACAGATTAATACTACATTACTCGATCTTACCTACGAGATGTTGTCTCTTCAACAAGTTGTTAAAGCC +CTTAATGAGTCTTACATAGACCTTAAAGAGCTTGGCAATTATACTTATTACAACAAATGGCCGTGGTACATTTGGCTTGG +TTTCATTGCTGGGCTTGTTGCCTTAGCTCTATGCGTCTTCTTCATACTGTGCTGCACTGGTTGTGGCACAAACTGTATGG +GAAAACTTAAGTGTAATCGTTGTTGTGATAGATACGAGGAATACGACCTCGAGCCGCATAAGGTTCATGTTCACTAATTA +ACGAACTATTAATGAGAGTTCAAAGACCACCCACTCTCTTGTTAGTGTTTTCACTCTCTCTTTTGGTCACTGCATCCTCA +AAACCTCTCTATGTACCTGAGCATTGTCAGAATTATTCTGGTTGCATGCTTAGGGCTTGTATTAAAACTGCCCAAGCTGA +TACAGCTGGTCTTTATACAAATTTTCGAATTGACGTCCCATCTGCAGAATCAACTGGTACTCAATCAGTTTCTGTCGATC +TTGAGTCAACTTCAACTCATGATGGTCCTACCGAACATGTTACTAGTGTGAATCTTTTTGACGTTGGTTACTCAGTTAAT +TAACGAACTCTATGGATTACGTGTCTCTGCTTAATCAAATTTGGCAGAAGTACCTTAACTCACCGTATACTACTTGTTTG +TACATCCCTAAACCCACAGCTAAGTATACACCTTTAGTTGGCACTTCATTGCACCCTGTGCTGTGGAACTGTCAGCTATC +CTTTGCTGGTTATACTGAATCTGCTGTTAATTCTACAAAAGCTTTGGCCAAACAGGACGCAGCTCAGCGAATCGCTTGGT +TGCTACATAAGGATGGAGGAATCCCTGATGGATGTTCCCTCTACCTCCGGCACTCAAGTTTATTCGCGCAAAGCGAGGAA +GAGGAGCCATTCTCCAACTAAGAAACTGCGCTACGTTAAGCGTAGATTTTCTCTTCTGCGCCATGAAGACCTTAGTGTTA +TTGTCCAACCAACACACTATGTCAGGGTTACATTTTCAGACCCCAACATGTGGTATCTACGTTCGGGTCATCATTTACAC +TCAGTTCACAATTGGCTTAAACCTTATGGCGGCCAACCTGTTTCTGAGTACCATATTACTCTAGCTTTGCTAAATCTCAC +TGATGAAGATTTAGCTAGAGATTTTTCACCCATTGCGCTCTTTTTGCGCAATGTCAGATTTGAGCTACATGAGTTCGCCT +TGCTGCGCAAAACTCTTGTTCTTAATGCATCAGAGATCTACTGTGCTAACATACATAGATTTAAGCCTGTGTATAGAGTT +AACACGGCAATCCCTACTATTAAGGATTGGCTTCTCGTTCAGGGATTTTCCCTTTACCATAGTGGCCTCCCTTTACATAT +GTCAATCTCTAAATTGCATGCACTGGATGATGTTACTCGCAATTACATCATTACAATGCCATGCTTTAGAACTTACCCTC 
+AACAAATGTTTGTTACTCCTTTGGCCGTAGATGTTGTCTCCATACGGTCTTCCAATCAGGGTAATAAACAAATTGTTCAT +TCTTATCCCATTTTACATCATCCAGGATTTTAACGAACTATGGCTTTCTCGGCGTCTTTATTTAAACCCGTCCAGCTAGT +CCCAGTTTCTCCTGCATTTCATCGCATTGAGTCTACTGACTCTATTGTTTTCACATACATTCCTGCTAGCGGCTATGTAG +CTGCTTTAGCTGTCAATGTGTGTCTCATTCCCCTATTATTACTGCTACGTCAAGATACTTGTCGTCGCAGCATTATCAGA +ACTATGGTTCTCTATTTCCTTGTTCTGTATAACTTTTTATTAGCCATTGTACTAGTCAATGGTGTACATTATCCAACTGG +AAGTTGCCTGATAGCCTTCTTAGTTATCCTCATAATACTTTGGTTTGTAGATAGAATTCGTTTCTGTCTCATGCTGAATT +CCTACATTCCACTGTTTGACATGCGTTCCCACTTTATTCGTGTTAGTACAGTTTCTTCTCATGGTATGGTCCCTGTAATA +CACACCAAACCATTATTTATTAGAAACTTCGATCAGCGTTGCAGCTGTTCTCGTTGTTTTTATTTGCACTCTTCCACTTA +TATAGAGTGCACTTATATTAGCCGTTTTAGTAAGATTAGCCTAGTTTCTGTAACTGACTTCTCCTTAAACGGCAATGTTT +CCACTGTTTTCGTGCCTGCAACGCGCGATTCAGTTCCTCTTCACATAATCGCCCCGAGCTCGCTTATCGTTTAAGCAGCT +CTGCGCTACTATGGGTCCCGTGTAGAGGCTAATCCATTAGTCTCTCTTTGGACATATGGAAAACGAACTATGTTACCCTT +TGTCCAAGAACGAATAGGGTTGTTCATAGTAAACTTTTTCATTTTTACCGTAGTATGTGCTATAACACTCTTGGTGTGTA +TGGCTTTCCTTACGGCTACTAGATTATGTGTGCAATGTATGACAGGCTTCAATACCCTGTTAGTTCAGCCCGCATTATAC +TTGTATAATACTGGACGTTCAGTCTATGTAAAATTCCAGGATAGTAAACCCCCTCTACCACCTGACGAGTGGGTTTAACG +AACTCCTTCATAATGTCTAATATGACGCAACTCACTGAGGCGCAGATTATTGCCATTATTAAAGACTGGAACTTTGCATG +GTCCCTGATCTTTCTCTTAATTACTATCGTACTACAGTATGGATACCCATCCCGTAGTATGACTGTCTATGTCTTTAAAA +TGTTTGTTTTATGGCTCCTATGGCCATCTTCCATGGCGCTATCAATATTTAGCGCCGTTTATCCAATTGATCTAGCTTCC +CAGATAATCTCTGGCATTGTAGCAGCTGTTTCAGCTATGATGTGGATTTCCTACTTTGTGCAGAGTATCCGGCTGTTTAT +GAGAACTGGATCATGGTGGTCATTCAATCCTGAGACTAATTGCCTTTTGAACGTTCCATTTGGTGGTACAACTGTCGTAC +GTCCACTCGTAGAGGACTCTACCAGTGTAACTGCTGTTGTAACCAATGGCCACCTCAAAATGGCTGGCATGCATTTCGGT +GCTTGTGACTACGACAGACTTCCTAATGAAGTCACCGTGGCCAAACCCAATGTGCTGATTGCTTTAAAAATGGTGAAGCG +GCAAAGCTACGGAACTAATTCCGGCGTTGCCATTTACCATAGATATAAGGCAGGTAATTACAGGAGTCCGCCTATTACGG +CGGATATTGAACTTGCATTGCTTCGAGCTTAGGCTCTTTAGTAAGAGTATCTTAATTGATTTTAACGAATCTCAATTTCA +TTGTTATGGCATCCCCTGCTGCACCTCGTGCTGTTTCCTTTGCCGATAACAATGATATAACAAATACAAACCTATCTCGA 
+GGTAGAGGACGTAATCCAAAACCACGAGCTGCACCAAATAACACTGTCTCTTGGTACACTGGGCTTACCCAACACGGGAA +AGTCCCTCTTACCTTTCCACCTGGGCAGGGTGTACCTCTTAATGCCAATTCTACCCCTGCGCAAAATGCTGGGTATTGGC +GGAGACAGGACAGAAAAATTAATACCGGGAATGGAATTAAGCAACTGGCTCCCAGGTGGTACTTCTACTACACTGGAACT +GGACCCGAAGCAGCACTCCCATTCCGGGCTGTTAAGGATGGCATCGTTTGGGTCCATGAAGATGGCGCCACTGATGCTCC +TTCAACTTTTGGGACGCGGAACCCTAACAATGATTCAGCTATTGTTACACAATTCGCGCCCGGTACTAAGCTTCCTAAAA +ACTTCCACATTGAGGGGACTGGAGGCAATAGTCAATCATCTTCAAGAGCCTCTAGCTTAAGCAGAAACTCTTCCAGATCT +AGTTCACAAGGTTCAAGATCAGGAAACTCTACCCGCGGCACTTCTCCAGGTCCATCTGGAATCGGAGCAGTAGGAGGTGA +TCTACTTTACCTTGATCTTCTGAACAGACTACAAGCCCTTGAGTCTGGCAAAGTAAAGCAATCGCAGCCAAAAGTAATCA +CTAAGAAAGATGCTGCTGCTGCTAAAAATAAGATGCGCCACAAGCGCACTTCCACCAAAAGTTTCAACATGGTGCAAGCT +TTTGGTCTTCGCGGACCAGGAGACCTCCAGGGAAACTTTGGTGATCTTCAATTGAATAAACTCGGCACTGAGGACCCACG +TTGGCCCCAAATTGCTGAGCTTGCTCCTACAGCCAGTGCTTTTATGGGTATGTCGCAATTTAAACTTACCCATCAGAACA +ATGATGATCATGGCAACCCTGTGTACTTCCTTCGGTACAGTGGAGCCATTAAACTTGACCCAAAGAATCCCAACTACAAT +AAGTGGTTGGAGCTTCTTGAGCAAAATATTGATGCCTACAAAACCTTCCCTAAGAAGGAAAAGAAACAAAAGGCACCAAA +AGAAGAATCAACAGACCAAATGTCTGAACCTCCAAAGGAGCAGCGTGTGCAAGGTAGCATCACTCAGCGCACTCGCACCC +GTCCAAGTGTTCAGCCTGGTCCAATGATTGATGTTAACACTGATTAGTGTCACTCAAAGTAACAAGATCGCGGCAATCGT +TTGTGTTTGGCAACCCCATCTCACCATCGCTTGTCCACTCTTGCACAGAATGGAATCATGTTGTAATTACAGTGCAATAA +GGTAATTATAACCCATTTAATTGATAGCTATGCTTTATTAAAGTGTGTAGCTGTAGAGAGAATGTTAAAGACTGTCACCT +CTGCTTGATTGCAAGTGAACAGTGCCCCCCGGGAAGAGCTCTACAGTGTGAAATGTAAATAAAAAATAGCTATTATTCAA +TTAGATTAGGCTAATTAGATGATTTGCAAAAAAAAAAAA diff --git a/data/library/viral/assembly_summary_refseq.txt b/data/library/viral/assembly_summary_refseq.txt new file mode 100644 index 0000000..f1fdaef --- /dev/null +++ b/data/library/viral/assembly_summary_refseq.txt @@ -0,0 +1,3 @@ +GCF_000864765.1 PRJNA485481 na na reference genome 11676 11676 Human immunodeficiency virus 1 na na latest Complete Genome Major Full 2000/08/01 ViralProj15476 NIH, NLM GCA_000864765.1 identical 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/864/765/GCF_000864765.1_ViralProj15476 na ICTV species exemplar na haploid viral 9181 9181 42.000000 1 1 1NCBI RefSeq Annotation submitted by NCBI RefSeq 2018/08/13 10 10 0 9362478 +GCF_000865725.1 PRJNA485481 na na na 211044 2955291 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) strain=A/Puerto Rico/8/1934 na latest Complete Genome Major Full 2000/06/12 ViralMultiSegProj15521 NCBI GCA_000865725.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/865/725/GCF_000865725.1_ViralMultiSegProj15521 na ICTV species exemplar na haploid viral 1358813588 43.500000 8 8 8 NCBI RefSeq Annotation submitted by NCBI RefSeq 2018/08/13 12 12 0 7010182;7060132;7278968;6281731;7208353;6927841;7465426;7292985 +GCF_009858895.2 PRJNA485481 na na reference genome 2697049 694009 Severe acute respiratory syndrome coronavirus 2 na Wuhan-Hu-1 latest Complete Genome Major Full 2020/01/17 ASM985889v3 Shanghai Public Health Clinical Center & School of Public Health, Fudan University GCA_009858895.3 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3 na ICTV additional isolate na haploid viral 29903 29903 38.000000 1 1 1 NCBI RefSeq Annotation submitted by NCBI RefSeq 2020/07/18 11 11 0 32015508 diff --git a/data/library/viral/refseq/GCF_000864765.1_ViralProj15476_genomic.fna.gz b/data/library/viral/refseq/GCF_000864765.1_ViralProj15476_genomic.fna.gz new file mode 100644 index 0000000000000000000000000000000000000000..9b21b44cd8bbeef63c8b4d8efdf49e939e03efd9 GIT binary patch literal 3052 zcmV4*|H^sG-AtE`Xc2~3alzoOYrSo z8Q;j<2N6W--hI|yYt5hhjf`aO-~29r|LN1`Uw``5FF(Ki_UE@he1H4;?c2}qKmPds zr>|eWzJLGnueX1G{oBvKzkUA2+m|2T{`!~qf4sl_`ThHkZ||q7Q~#cej*86ZW4=b^ z+?>CDo*nW0Klh&3^0|I)J@@9tjCih|%hNsIXS^Oi531*S{yn2+xo0JP4X?|zT31ue zE}o&YpU%VJXUzz0#oRv+zt zwxfN`h>~Gsxg2pZ_`D^}*uWu)a;BfVvzoSbr258S(i?(?+-H?AR*IEaTh&}q{xe=U z-=2?SV?DFamZYze1QHAb&-3v=R5BxauF{HZsasOiqr>)Y3Tv+prJ_u<(<8)u8FlkS 
zTFP~hi5mq^NQYWM>S)Cma&%#R)F(;xf_BqZVqv4Xh0;!1LRnI}!~*FH{3J1t7M0AU z86>f+QWoS1J%UZTkiVnM_C5Jc$%muYkF=iFkWQMgP|(Ej)-}MsIhW1Iz^f5KCX?yX zTAY{V`}pj9Irc6C23qN`Un`!H$Pw1ySy}d>5k#UCwtbd8R~8VY1y-X(8q_SH@!q#7lR z1b4V?0OaM#DG4SxKPk3}I^39eD7&WStg<^uS2^D3^Yker1_F_7+oC^N(eyB zXgveGSHe}+bBb1E)rT!K{?{^}Jd<~^@tuQ&jah3VB`-v_w57??_@dA#qPJsn*9u6O z2{t9c^Xy2Q#K1=2O@13)2H#l`35+J!1yz;Nh(0RF~8O)SvLt|dA!-w4eHZr{M9zn{w4$q`zAZ8rEE8GOU3e}lB;UHBMTU1 zv0~$ZBAw|O8Zat5u~)i$?gsXNgFLfHnwmgjc=3o2l!qKAHz3Z-WKEHSQ}mI$tof~> zdU1$ebhNLKmI#|hUUPvalOIFrdgDAwC;xmI#DNTSODXKAmldJ>N3w&jFaUAy9q7X4 zg2F7D#NXCyu?7(fe<|z< zQnz(4e}WP>hBl{!fY;0KyvI!g(2L7fy6o}olIB1zE{a&~60rfO7hgsvDy$O18%vI% z3RC!?6zxCKoWhYQ9_#t14ynye?RvxbB&17Q?_7XWK$)L!2^p|RLXTj8pTiQJ(2b%1 z_~8#wArmM%jX;__KBPbrW4OfJ0*Qi)usmUaG^Cq)fNvh-EL5JWO;-bQxM>j1D6-Vh zd}so2kd#vGx}}!N3M!O;4Jc-8;c9WHNN$C31@UB?Dx;THETam-3eM&H37MWGMWVp1 z;Qn$NkhCr~3C-6N4Y$7DoFha(n*g0^oTE*d$);qVP2lTU2?ZBLrM)mC>!kT(Q)J^K zJ3pu#q&m%*>9hk+yA>F*Ge$w>eY1E&aU_;gGzFebOxprUNr2wBw4xLc%b<5omt^hQ z4Ekgs^!m15EMmsyzaj0Wg;b=gy4RQP+}1)WBQ?*qC^rp*auF$8Xh7 z3F>Qdn5m#UGxGkkVp0oG+#3f>u+p{-sIlARuyFYhMrTSVX7JCiA!JAc$$1 zd_Zo)6)<)_0lB22Md5flIJFcjUh{0opaL@nn^4RlFEQVe8r0D%w?cy#xU#-w5De-9 z6Z|mYn(k65WmEGc99l+o`>2-I(FdbY#emBC(jgcQg({a}4X%EOnzVRzsZ?H~D~Y@z z@k8svrG;-)u`AW8f>7Q>_-c91#C-eOqve3x&&yF2}9Y*lx-N z8WbT-oXu5w8`ieGXsQ}PW+2-bt!?^|aNQPgZQk_ydIL+_V{W?dhdz0_86Sp0vscrN z?PM)oxec4GRcMpwE%4Bh-mXe5*!n~+R`m(n&r88PBvuT&TbgPd8#J*sLigP;WF8?r@>Ny5-d!Uxa zETVRwc7bwRj=t#aVxWVy%yxy!J=)p?XZof{Y50XXm%BEXZN^q1hAoD2oT7@EQ=>w)w#|~*AMj)Ahy%)Xm>n( zs3J|i@=o9?!Gl$NJ1K;D=%MWk)M3qX(fon#(#KSXT}p%lnKhLu6BOa(t*l%p_8xLY z--DLe)``1|<>cbVZ*1?H6rkN~4}95;j|-j0UFr4$y^5T2O`Z$cN~A|;jQ4m`$A|hP z0BSo-j&RDaJ!e|ZZXZ+GP4P&#DqSDs_EsIQAZMEuY!eakSTZA_m#9?pnEGOM2mMqFKrAj;@y+qovuNzR=T(n-kK9 zeR&DhvclYgsV~Bv?abBXL)#J947WkBT3WwiqI*{2h&Y@C>q{>qC8kl0aAdoGZd{mi zK{_dPIp(W@FP~tCqf6=W1t2~N(b@I3A@)C0l6zpH&r{~8NmoXiDVV0oz5691dE&U` z^LrUKKO7WI3;OgP$-rs5>jE0A8>QZMw%gX_IC2>luBCLiu>3QZfIu$mL*47HOw(;D zM^)~DlMnWlS&p?8KKJPDUO4+~ARG6a6K^V|ya#SuEJ 
zLtLg#z+s~%)P^v({(O4wlux55`@c0jT);?&WHNhoY~zuW1iXldQ*mb2%4)p_YPGGT zP#-(*nS{^eeSAfyqvAuC-UOUHzWr6J-o`x3XmStc_F&u0&deeS4MN0C;63=wydfwC z+dfG5nH$wqQY9xmNmQvF(|CmHy-nsc!v(bm)}6I6uxFkE86}w4H%L)=?%OT%JNyRb uY&3!gR1%vuD7RfVFz$c;+UNXVKll01U;CVY0{{U3{{sNgRw9m#Bme*>M)cqS literal 0 HcmV?d00001 diff --git a/data/library/viral/refseq/GCF_000865725.1_ViralMultiSegProj15521_genomic.fna.gz b/data/library/viral/refseq/GCF_000865725.1_ViralMultiSegProj15521_genomic.fna.gz new file mode 100644 index 0000000000000000000000000000000000000000..5f60e4660c5b51d11e669235475537f7509efcab GIT binary patch literal 4629 zcmV+w66)Uh;+%&X4`n|sI-g8b>WkyCs=KK90>p#Ez@~bbu`udme-~HkH|N84s zAHM(RU*7F^|M%VBfBMI}pY1RH^wWpG|MxUmdJg@!lbA@j|tlianKY#A-sl8|R^RK?!)B4f* zmajGXvgdp=&->5Ep1+*0Rdp`4`g!;B%C7U7=hf%GJ{Nkv(-rk~pBp?^>gT=BzjN+> zUeMR(Ue6oPW!LjW&x@n`^3A!*_AAe4d1}4;T;ciSyzRL{^O>`pnsZ(K{dvOk@z#5v zM;#X0TI9K??oj8`qbt_=q3(L__Wb=>@$&?8UVNUtm)5uSnA=O;!{zl{XFJDF>fO_K zbzXe_VVBQ#o^SeDGAE+T)_KCSZRdt(^Ueb|Tf22+y36^rmN-0>?sJ~=+}C^m%vjUd2GIt=ny#{PsJCXWBWz9OU?;(>&~)2hpmu zr|0Ea+Q}K#z{k$B7oRytz%eQ;*nYOX2;OrVXIr(K&-aV#YBz=j>lUTeZ0*N6&--}Y zET4xS2bvyIi&~}cN9#EM<&yf0KBd<;Q9>}D@E;u|amM;ahp-6Q;#snsM0<~dSsTR) zNeR7sJA_FsE#tZ3Qt%A9al(-2I#)bfpi3zlbF)#Jj&88Cm4va zb-jHzeRCb{yY^F$#cKe<;=H?GYp4w(C^k27zkMQ`9WQNcHrgL=#pWu~W|Xje_yqH` zkw%*{zI{6k^;qMZwbH4T;dQ`40SXqLMU=u^Fj+!}lNefKfgS^JahKj4AE|r}W%z*t zUoobb+PX>!B*y6Cup95(SyF*81SxT}96?}=mgE2=>VQdQgJQT7;KhSaJ5sl>=KNxG!E7xF}<_N$%_2d*sb)>;(f=&U#?z2aP zO@fjux4#ac%?BHBP@W7MTgq@3Fz45Er?ml6s@DksZng;;0-`GLeRfPCWjH(cB^9?L zeUa0gonhx(vYp+2g3=zB60sHet+N3(^~Zodi?SiaQ-)G*h!usAYo?F>T)MRYQj?Fi zOZl_3#q(-eMfSN&b01258zGCjxJfE%F}ci&>Qp`L4AH`t+(*k}>!d1`AW5zGBqEJ% zI_?GN8vrIbt!_D4*J~#X7Q{1em_^<_dke;#pI6zU65ES_~33D2p61I(53PB>Y zIbRBm(U>T9IxCVlFKOvP=a0psD;EtqN-7pcN2rKJ+mMfYPvWVh+aNL^FnP41p>BJz#>Vmlq36# zY64?4tbxI#F~m&AayD=IL9`kWQGmz_bgdz9n$(fAM*A=7zERr&4WQ07j06ottf zy4{ToaA3MTxR9-LaPW_P8N(Q8h;1X479Y5WYlt3v9id+Zdui(y^%G>a%0 zRO1SOMezP|^*u|riNS3q2X44QjmAZw3g6w#j@xnIO^fs0+eHHI)~aA3CyF&9pF8d= 
z|E&Rxc;|{M+;oh*oq~i4b&`gSmUU3UcVT=Ab(F}_s(RQqdh)$arY}yh79EPg0_o&J zb~jAdUr0j1Q8H~`W?ffOHoT(1=4}zEWr%KD)N0A*I&ic4f86Y=&*x@ef6_P8LTvAh z>jsJ;3hzQsTjQP0X;`ifV71NvaCq6ja62zyR@Io{L7BP ze8kCm|J%=JOTYRETiUlqx|XLPRKhGK*QEC|m_DW;U1_kSp;eJCLzE9LRk;*mFxey+ zixBNfMfk~C#Ff*RH#Kj?1tTPJJ8tW?qj||`Du0=k#E znX3mn+r`byMfV^KbPn!uNnP&O?#iQT=$E^5_Q_#EKEUmMqZ`;tMw^Dk4p`;d%zBnQ z-h10rZo*@*;hs|I2fMSaqTI6(iJ^hRv91cDSB!QSTgRqbyc4Ik;vr1DiVETm2+Q8_ zW{`iU0(C4}x1=w)?Td9-xd!I5l!a#3N!5-{qV5~LsQFM@J1@8BZ~Fpvk;FyPK`m_K zdNq&6&WrmeZsn!xxgDx6<6Lfn8woR!UriVqEDR^PI4hX+Nodff!9ZbKwkXna2@Suv zNRu$`1dSAXMQwSr?a~Rq7qxRo?~Gd~2n*Akw8EW57UyH;GiAEI{bYcFNL!IT&DcYGBBG2nX=_i+*qqZPw&PBxmEUqObCynIcB*)V4g+H< z_BXHt%Z^#4&^Da7EJ0bG%~p&U>+S4$7=wG59cM25y*rY=K2l+S_xTm}*PpDg9&GUb;G07)-P}Pw(1fm>szsGikYBGK)0jgcCp=WZK;t%=*LQXr>NU* zpqwE_CbI*|-Rr7A)zXpJ$TlCQf}Atmub!bL+P1??YI=IaWvOFwP}bL6$Hf5WklZQ# z9kbAlU}h>N!#;5fA{3URk>33{o0`YO3+eco>Bc(kk&aO(nMztT&L+ADEHbBXF{n%- zu~<)=j7@G_pODC>IuwpYyW^ym^vf@FI<}>t-5p27cBkG_N-<>jHthjDOgX1Iji>FH zh{mPwGZXvE;Q{R_KZ039g@UKMX*uU5@15QjYq2Zf;fk{bavozhR@$Z>k3ciPK&)aK z#h^X3B1c>1|K-^uS9J~L&|n7oE1KNHH149(mi8Exd$3^0s`!N&kJ*;CRtaSVn&yAq zL82gNxgh}MT&ZL{T}*dSa;G&wLt)?TE>u(pCvMB`913$yjtp1#x)OS4de8(k`U%@? 
z(&&HLr_23c^s+m^SJ?qjA#OB@iY%_^#IpNeA|7LMn?~lo1jc>^??qpjw8r%F)CWq% zurX1p-EL-29yZB9aoG3iw;M7rdNmc8{?YIU^joK2w{0#rmX>%S2{R{`4o;k{@#tX9 zggYMEQ?Fit&BC4j(|oiNee?O1=r^Bo#>Lj|TxGox^`41|svj%op<7R0AchV#J^6P+vP1g0xb=cWMjsnPtj*iT0&#mYQyTF`cFOK};>&1G+=yV%Rhu zU$w+8pRYc_z1!gOD2tq+c6hnH;;v2IPOq*07#x+hVVtAB?iZC~F+ws^FA~;vrwG?f zgy6!CTLhE2Qy8ywoz}S2=JP(sS-e2$x==gQfKzvFmrY@ur$Q<5``jeknQpj~o)a_5 zD!dXs(>d6X=FBSYwS(dP1{yO%RTyad_SfOFyG*S^d+Ig)yZqeh5bTVydi~cK1~eIy zwH29>KU&L^NrGCJD!O6S=*rBF*{?E^w9U)g24F9*ORtGiPyV3{ZyUnNF-abbQx|wA zsgKw*IVOiIlRUc&P}6B;+B^(9tjuxDPSc!*;#jQ;y%vL7c8NP^k1yw0G@P9@zIS}u z?pI4H<3$=b&ur-SB%2*gPa(w(yJs5Cf@xHq$h4QoWHVD}*86lb2txON?8Y%wV%N^j z7SmW4by4(gUBf(g%FU3q4Hxh9HgPdB5?r1Vfe&8-6P$CSl-k&7re|%VFeO|T;4z$>?Xc}OXDawA+;l7IOa(l|ATNqR6#T>l% zfuuTf!grG2HiSdg#;+%|3f5QlHE`_D{F3Kbnx1KOz(2NuA(ROqhb~30G8&iRW0c1v zA(&R*{Te2+cn{kNov~&b^E$*3MydMs=A?|4-Qfswr|<7CSje^YMtZk%IXLfI8M?Dl z@%Sc`*-eIh)y*|_?TMPQ?pXVsQE->+PEXh~xbwmkkNRV>q;37Cb9$%uR>P;ao3QtB zm}<|x%MF64H)s)&Y)NNrCL zc1U)+%V)bt2lwyCVZZ!*p!w$07Dps^?Q7E5!W`R~Z9x?@y#sruB&%GtHsd$#(J?`F zdwvI?&5<;d+r}lQqAdgwOS~jBqyZeqj)^>X63XG`)qwnM8`3e@YLDHK5x0IFn^hSa zWyWWkuxQiR{q{EpG7x(>3@yOU)IC!MMhW)F9e1TAj$~aGCDykkOBd9j9O0hdTk=F^{FCwTO)&7HaRHk~{&lE3Yg}m literal 0 HcmV?d00001 diff --git a/data/library/viral/refseq/GCF_009858895.2_ASM985889v3_genomic.fna.gz b/data/library/viral/refseq/GCF_009858895.2_ASM985889v3_genomic.fna.gz new file mode 100644 index 0000000000000000000000000000000000000000..73a8943db279d31b7f2396a9d523f5c0cb85a634 GIT binary patch literal 9591 zcmV--C5YM|iwFP!000001ALv$?rq6U#pm@D{Q!U>aff+Hu5<}F27DNi^sPiH5 z_MGziE!RnqNZ<6megEg|UAxMk%gf&X_|N%ofBX0U@DG3YtH1fx5C8JhpMLt~ryqX& z_dovOryqX#>G!|;^)EmE;pboe=ZD|_&u{%@^)G+?{SUwT z;n%``7>U$G`r&zxsUQzkm0epFV&2pFjQf^NkbT5mD8j-#aq9 zGxPp;M1Fqo`Lq82`FU1!e*Wpt59g1cFWvvXA1i$%;)H(k`KRvh`T6^tKW2RX|M^JW zKj!_x=l8mIoi*GW>W}la&$TP=LQdRI#w_Um_2)x(4Rb+V_kLv_;Qlo4t>%G73 z{qC3am`5M0S!3K!g`W5G@%r58e)B%#XCGRZo#(qByGx1r`Ya~yPb<&t#9hVQ;PY5@ 
zKUA~a`O{sauAN%p=a2W-eUMq;eUiEGy~SsRI+VMfyH#xF-MYJs*_xPD=iK-%SZ8#P z;jaH~(rm;$|L0J07FMBE-Ot`D->=osZl*jELNA z(T?9Qv11iy787?z>VEQWgO2d-{;YYo_`cc01-qbJx+Nkx?#OCFPPz+k;9y$w*Vu>vq31yTzRD2_6Cg~WvfvU-EwY!Fi#_GvwUf0YXhX>BPvL% z&Pj@OWoAT2%bzp2E6ll(pUMJN-UA)tmEf8WTS3Qi&5_>)ji!D6?&!k7`OSlja$Hj) zor=-xkqaGNjA0mQm+myrdqv?SyO*PqR-nd|#6H$ef21a-G zYQ#TS@<*_xB&o-|`Yz4ykmAq3G4yx_?WdfSEY#H=`NlrTYK>(dL779U=W50(%M8u4 z+#BP=MiWEcwXugKqt@~PF~93mV-d8ibB(!Jj@h04sF97?eVuz*u%(ovQe_JtT%xnr z8aI|WWLZOYHl_H|{JL_8cWZ6B3|!6MjG33lj*$B3tXXn^+N0*+&*}Z zL6MswD(aBil+#yCb}jmmmdM}81C&C(ah!xDqgx}(jzh!Isk|@IPmK$ha12cknSgl; zdCU1D51#%|aN^S9412!3f*sfmX^U4VbHsJV@QmuA0}6Akm@N;bXz<7a<4X$9FxFR+ zGV4mIYf4_^N@5(0q7IJbetj(Vgv98qbU$StOLyidrB2qeQTNd4C<41u^jTp_)Out> zR@Y80&?o=vw0riLS#QDL5Z_TB4U$Trni zdEz(ppJO1#5`~l=jn?)k2$A1C3U$l3oxdDqQ+(J@m~9snfCWM$JB;JM#(~3}=bsQs z(joUBjbVqc4^pfP-fNy9jhSn;#9tO<^9NELau!*tKQrTVOtY0E*w}iV%*4L> z=$Oc~)oP*%J-ikT3gL$yRC|AxiLaVsj8%8e5;m*wiK&J%|rPup^M@=B@W5$#(bd#gayvWu)&iU zj2LwiDq>|rI-@W|L3rIq#;o}&n`t8Lgvl~_$$=n1m}!N)NYzn-Ft=Y5t4AS=AdhH# zyACvQHquB1A@iL{E7ISQrF4E8ShI-E+$!BdDfj)B{_v`8MIGbo#)~yUDqcWd0=K3pE0qDIt&yH&f(#k6 zmW3aSt#k+sOA!OeSE9KnC0>pbzY&?q-N4g{8kG4YC9+;rK5>*8=Hf0e5I$ct?NS~ zBm-c>WjT*<4r+B8KO>_ie5*J>!m~JQGQ;sY!Q+<;Qp_KY0rf28(uD6xYIgU~Hop*) zavY>Mp`2S0%7?NKZQHZ)__dq(VD2|dQaA#>34$e0DNop>klcP&X@E&$v#0#FSurHp zQzh|aw0ELNKni2mqbD;Cj&~d;zdww zH$&bK?Axh)fu*u0L&n0a_yQ)P7b10%D>Ekvz0Y{sNxAkRiv)@oq`++MZcdgia^`W^aWdhE9+uCt^_{L3M?p zWohkN$yn?sS+XAMWY}+TVd=17BycqC92vm)h9mQ<%gZ1~Q3e>OMpxI#3J`0c5uB=2 zPX_R2qfXai!GGzhWwTa`xS|56#!L62IMKsh%qVC>jv&weo{#>V1%`*ZW(bxG|y!brRSW`kE->$?>vtob1+*#rS7 zQt4+Z*Q#FNH=28Cz5FN<>;&tj;Pp0WWNvMlKc&M1X^qFi>_O_o+D97OY%TRdQf!6UPh65Xlo${zcT3 zr%4M(Q&P$5({)g~1PD^e31q>5F8{n@k1*0apia1+-rAtBp>|~iNP~EF!N~xh-xW%* z6`Qs4H4t$2Aj6PofjzBpQLd1bTV+M%mknu7%4FCUCk;MCcsgDsaC%>oq;yWfl7$c? 
zRZm=KzH`(h5?c%C3U3(SBEF5A-a14X$ZB99Dt%2ZDtK|tfx_u6zf?PK9BDZ!iX_R4 zv5r=`ZYvWQ7R8Ram5{BFC+SG$NfrBz84(DQ{oXxIL9(RdB@RSUuj+KR1rpKhVU|nv zU|`BDvR%6!EX(;taX6Z9!CZ0cZB(Nx;B!JlhKs-O2zJKAoSqyjrl*^#`b zsnHr^P`n}KdQjyA?OS;iSe9w%T>b#uoe&nhO5V(MJv{GZLN|g{^lwBUUXijX?Ir1N zVH#9updvzjjV$4F$`}kK2QQU*bpSw4Bd^45&%(D0l(RFZ$%9IujxqppI#rMHi(97U zlx-hd&!^b^s$Q;dQl(t6BM2D9%1nfUsNn0;cGzY`9Q5A^Ks9Wa<+cEv+UhvID69)n zhG3KnndW8yq)SA(3NES^QfQ~S3QZ{%67Q-IGO8!=FrwEW+f0C@fJ?VR!d`lcwptN@ zlv@NI75>sy2ko^ZC2}X%j_nm1z~A~zx&<6!HO4$N>XujodF7kPBV>|@z1nj)9u zlHCpxT#0tWnu7KtDWH;<=88z*6n&M6FcvN%AM--c$*^77z;=^(&~hOPH=}1Gc#wBT zYIVk3ii+;DVSrZxUz^0V zCLQ`wxOD-aisJ0tY4h`v$Vvnov&pv718XWb=fg)@M8{%H&>qGYd#OrgP82nSG@KvS zM}pW02dIXO`Uol|R25_=H=(!A1)jBXXE`Z&8->n6Aqd_!V0 zM|&N)MMu64VKxTt5;$=Wmfpn`c(4$gz%BhKI+ql+U*1;45?cgYPX#SiQo86{+2#g9*y~!3`urL>jJ+i-`UB@tvMs~j-(j*WN_AXD!sNvut z`ODuL?Z-wM}B~oJ`X~ir1J3=|I4q`b8Tdy+O z_&OTaQ-HZH5s$P~k%OQ!^uo5VxASRBkpwZZeHz>OwMHt#$dZ#BBqzS2d;^NQm8(G2 z%OeG}#v<2{F^U_A=ADIwLic5r;<2s9t-R-k^HMjW-Q@0y~28}&`nrZGEU zN_H4z=Lkxqees4o7!IxQae{k4;JiZ}8L#^#T|-NsUY7xhan7e#!z;{ibyq-}kJqmb z>V3e|>6r$>S2}fEanIVM$;AF9PWnG=ZX0gLq-C70MdOz{cjduel=A23Y0FRLdT&}q z@>yI+yOT+&@y#UbZOaQPT&8V7qc>-a>IlVa!T%+Kx16BK2Grx|^|J3v7Xz1yP^z)X zvn*zwhvrzyvP97O0t8E{53%e>S-)u{k^-27lOmtA3&BE&!q^)m({f=9joxk_qpWK{ zTO}j_2y6t{Tx^pDmJ#xY!*%?$S)8!nrnka1sEBQ;so~z1Q+up4!|CeikqTqiT&|JgnGw!;|pL`Y#?+?nHk%n zvyi2qEB(9(%Cp#fP(=rdBr$u~(3RV~P9xO9Vc;|65^_NzEIVI0%RJ%{BSu+ZrLlNN z1Q|xSNwwB2Ekgl%HO6X;gSw}SG?#cW-RVhaWYEBg9|dw!YP)>(1t=jFWqdqreh0E@ z_RU(L-Q6cD^x)Q=yCK*C4|9NJ7-!DMU7a^ioxN64243P?=jd{8{e++%j{+ZFfffcQ zK(8|vij=W4yK{hXS9xrk!vGT!ZJB_y8=DX$ZO;y9OEms5hFd{4fY>W)8Jk_W>YlEG zZ~N-jvxl=}i=D1B$2N&3WJ>t1ZL-A+r?wb< zb-5G?xL-t?V+vv0;~Tf{FlusQ`Ma>mdiTy$>}_5_jAPwuxO8X|%_A9JGnrI7@x>m^rsen!4R#yi_t<=0ZFBhTatm-hw-l9ezPs;&aGYwR=cg^7`X^Hkk-mXn+6cX z?F9iV%S`n)$tT^=c*3a~%+}br3&-bn2iln2!v2mCs+y)nOjoAc{&=p0=VAH@Xc&?q z_s=f5cfZ*Mh_zv^V^OA&L7w~-Hq~&Wv|1B8qQA1Lc4M50wWew>;o8>eJ5*hubLn8) zAmpZ#arCb7o1vvpf>%L5 
zP*}Hd9oz1>=vjPGzujJ=-rL-uyxql=aNh}WHta}vt=rt609{NiIh3l&H@2UgPKK>l z>TDmwcgIfPc#MD!eweZR6xIG|BM0uKK|vJ}Y6LiV#yRXDhN1Dh@hKhMf~zX9ag(3P zgA@njiy@3}0a!a<{iW+$1^xB&$Mt&Xx|Rv}O5=t{)d2WLh7@su*}aHymf=?N{S0?- zyhP(&DjqBffc1t2u84dbCi+z#U@ zDJDiq_#rWI_zNP3krP#g4E+_>fvfQXhcWh0s zSsI349=|dTTLf;`i38 z^krnmb|=-2`)T)Si$o;~i#uOljLm3+?Jk0VW#Rzq%&j_Z`(!R-3+t2Tt4W|2;zDO! zTvv~cY0T+9o8H(Vth0lCiNY(BLwF>YwuaqR3zsX17yBUwSdl5IXQ%TyY$7raY?RBB z!LaTO3yc$Aq;2~|ZfmYHr!Aky7F7G2{fl?!n-$;H;OX?QHZ5>RlsNNaMY-i$5Y8u_ zy}dqrDCt0nRlwzg%c$a;Opy`zC8U;zpM*&(llZiol^t?{o8*$p2oyXr$hqCC9DoDD zx9NI-CB$)8Lo(fSpJ3Aje7z$KzF7U!1^*ys5H!&X#HI~%1WA!kC2kL7J z%cjU(=0h#OZp2MBw#6D`alRV_&tD#&J6SoXokszW!@P!AN!xBe;SUG&(XEL)cgNTn zh`VbC%xrU7ZmPE^95rGCpxbc6%wS@IN3G^L#}xX7A;(95LFA{urj#uu3xGde?XIP) z5ssT6BN;O5t=xrVL(N%2G;~*s36M*iBdSnMK22=IZXDXY$_Yt0iRHI#{iIjSG-F6A z6UYw+i{YD z?hSoryM|jkjs%JGq#M`U(@Av1Yf=+^Rb!DFa-r}lIHsQVveUx|sk3()h{9RMEr9Wz z+7X-{C6Xo}S9!b|JW1$9&=}qT3edwj_oIXn#3ntXt33`2PSSF}b*yTFVI&w99)onK zV!4sNUa_%W|NN4~as9RQk&QzN&ihe^eR%juXYV8i_fICim!5+iE*Q!r;?YlmOk;=h zrHdOCThC>FDWf#8)v-zI*X$KxUv1)iZau>zFx(CjJpC6h(mgEUaRYP&%fRk*uVoH- z&L)OiNj5Y-S`wSVqmR?CT>QA*$o9ntYNwK(X3gy}_lDRHJ=^+-_23scH<&9dAX!S{SEgmt%66*R^0# z8x)9kNLhcji4)sIJhj_#7{RE47mQgBvfNko28XqG>~ck3Z?ieqqc>1~eQF^HZP6lOy}^*iP? 
z)9*(024;p*wAW9ZP8q^H=q>+ws?q@thkxvLqSphA544W-&bHaf2Qs2AshoNo`*2U) zcCkAS^dPxK1QMQ~TL-8ij|=EDC27%dPxaWgpmIj1jA-ph8wu(W>7Zv3ow?8G)3X@%ybBN_SH0GqD*d}C1q#N6cG9Vzm`X1T;XYFjHR##^ zHe#EFdq=44*qdXhFkUXg1X7ofJzFW=k3c3WTF+)*jO936Yk{)M;7GL1!)H7Yl(vjj z0n%_cg{bGTHbk(tVJLFDot7wL#u1YgGxI^XiYShGdg77e!1Al#&EfqLfaqorGU=kO zOkY**%6c=6^Z@RDoRCwG3%{_XOC+9_{1iz>oTS2)t&W9#S|xR| zPlj|XeGGs4qWV+_+fr{oLghx2VPSD=Q2M^3MFl**zl0x{wL%kKTb5YFMAc)?kp6!Ut-qEnk zd!_%#HJAe;N6#Qty>C7N^FiQc)y{$zHfCHaEg>iVtUIN)fpJHcdahSHHa1}y)|`10 zysklM1+%mm>F*vFNEaqm`@IH~xR;po_)C*k0Er{B;g5+wK^DTPZeR9+zziXJVd*xzyxYPJAKEIb`FT4 zn${Agc4jkWGG=m^5N;SYU5NV{=r!d|O$zm-^#`^=?o;`VVBM!c*1-t>T0Y5OLg#B7 zTD`g$mbLOJB|X)wJ|4)FG`|TLK3~lD>>Xye1Upgt8PMxj0MOxEVDg$e0NvyOVmjmR zSRxHW-Flh9HYsi4dowf*%3!f~q(t!k3OW6DRgYJ0q&qrOtXxl@fHkNf{iMekEaN)e z#9{??`>%%IGqN54kvjHv1u?6Nq3wRVLAhzuq@=y_8vqwyE&KV}GxO{Sj`H;I%5*}> z%TJaF?@6|~o#4@L(j9~B+_qEO&`tN4fb8-!n(O4!HwJ#23|Z8Peq9HW1O*ViEoekL zCIEY^3muS7GRz%4;VFBbo6;r-7?}ep>}_!O+Ysn}+J4aSVaIl1+h=!vbGTcS=^kZ6D)7dR?h7%}vyz$XG=+T-&2F$S)bx2#*D3WrlO01=>DOD7 zQ-7Be)k-wUBVeQ(?DfKz*p}=a4CLRkEIV!C7jN-y;_;Mc&xv8;H_=PFtPdS-vAFj# z2QhNZ){~>(+X>Z|*3^=R-7lE!!E>F4G?P|h8))vhv-s4b>8zs+Q7DH_|Z+ZnD6@ofUXOCuZ8Bt#G zYz$HT)ssiTez**^rQRvMqys%4!wz(qzdlUg^`t9L&$0 zh5-%`R6-BG>UIGOw)`p!*llSJV#c;#Jua*+dCYSO&mP_Khgq(4Vz{*5J4ebRNfiW%h=urMdQ)#e%B zlSnD;xed}LJeIKF;U4MXhx-O41_04VPgnGNJA*N)oqFttQ#R6ma{!3in8L}pPk2%X zrk7qw52*o_gN^Hgz5T)KuY{&|CqcoF=M7*=`b{%T1{n4B*)no|TUK@qyo0@C^Orju z(lJTWa%n&#T;>0>_h0OUSkd|Q${1L>wEB4>@^*|ibsx|8%B@vwl(>DS3@QPX0F=6Duh!8B+}c7U<|E(B&lIyq=R+wu_IxjC=-!<0NH}U+gFRJbxjsq2 zz1)QzK|?&3Oy7~CJiUWW9FlAe>MO5=G&wD;$|C_oyUZj9kfw6tBY?xd-XE4?3~I`yJy7h#E5EwUc(XeM z%d}|wb-WJm6uYNPYUELQ9zfPy&V3z_4%=l4-o^3;mGNF34Fh~L>v#)^k>fX5QElyh zDG-l=PC-o;t@6a9W~CeE)O1|i0S&FJ;>{*Zze_;_*de4a&p~g@NmB=~oat9Zv~fx} zJZr1AkSIJvQgQSKI9eU zT;jIZ^-DJVo&mpe3F^dS6s1YAOhDunCWoAw+8*9DP!(pRwUe;+RMYNFI+8{c@NyNb zw^sl@OgUC7-o==HX#^G$Qhq6E)SEaI6&(-6-6l6)Te@xB Date: Sun, 23 Jun 2024 12:16:29 +0800 Subject: [PATCH 18/18] readme --- README.md | 539 
++++++++++++-------------------------------------- cal_memory.sh | 7 + kr2r.sh | 26 +-- 3 files changed, 140 insertions(+), 432 deletions(-) create mode 100644 cal_memory.sh diff --git a/README.md b/README.md index d276f9a..830805e 100644 --- a/README.md +++ b/README.md @@ -1,237 +1,134 @@ -# kraken2-rust +This workspace contains two projects: `kr2r` and `ncbi`. The `kr2r` project includes an example that demonstrates how to use the `kun_peng` binary, a tool for processing gene classification, to build a database and process a sample file. -## 0.Installation Instructions -To install kraken2-rust, follow these steps: +## Get Started -1. Download the appropriate version for your system: +Follow these steps to build the projects and run the example. -Navigate to the Releases page of the kraken2-rust GitHub repository. -Select the release suitable for your operating system. For example, if you are using CentOS 7, download kraken-rust-${VERSION}-centos7.tar.gz, where ${VERSION} is the version number of the release you wish to install. +### Build the Projects -2. Extract the downloaded archive: +First, ensure that both projects are built. You can do this by running the following command from the root of the workspace: -Open a terminal. -Use the tar command to extract the files from the archive - -```bash -tar -xvf kraken-rust-${VERSION}-centos7.tar.gz +```sh +cargo build --release ``` -## 1. NCBI download tool - -Downloading Genome Data with NCBI Tool -The ncbi command-line tool offers functionality to download genome data from the NCBI database. This can be done for various groups including archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other and invertebrate. - - -## Key Features -* Resumable Downloads: The tool supports breakpoint resumption, allowing downloads to pause and resume without starting over. This is particularly useful for large files or in conditions of unstable network connections. 
- -* Incremental Download: Users can perform incremental downloads, where only new or updated files in the directory are downloaded. This saves time and bandwidth by avoiding redundant downloads of previously obtained data. +This will build the kr2r and ncbi project in release mode. -* Automatic MD5 Verification: To ensure data integrity, the tool automatically verifies the MD5 checksum of downloaded files. This step confirms that the files are not corrupted or tampered with during the download process. +### Run the `kun_peng` Example -### Genomes Command -To download genome data, use the genomes command with the following syntax: +Next, run the example script that demonstrates how to use the kun_peng binary. Execute the following command from the root of the workspace: -```bash -ncbi genomes [OPTIONS] --group [COMMAND] +```sh +cargo run --release --example build_and_classify --package kr2r ``` -### Subcommands +This will run the build_and_classify.rs example located in the kr2r project's examples directory. -* md5: Checks the md5 of the file only. -* fna: Parses genomic files and generates library fna files. -* assembly: Downloads and parses assembly files only. -* url: Downloads genomic files from a specified URL address. -* help: Print this message or the help of the given subcommand(s). -### Options -* --site : Choose the NCBI site directory to download from (RefSeq or GenBank). Defaults to refseq. Possible values are: -*genbank*: Download genbank resources. -*refseq*: Download refseq resources. -*all*: Download genbank and refseq resources. +Example Output +You should see output similar to the following: -* --asm-level : Set the assembly level for the download. Default is `basic`. ["Complete Genome", "Chromosome"]. `all` is ["Complete Genome", "Chromosome", "Scaffold", "Contig"]. -* -g, --group : Specifies the category of data to download from the NCBI site. 
The group can be one or a comma-separated list of the following: archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other, invertebrate. -* -h, --help: Print help information (for a summary, use '-h'). +```txt +Executing command: /path/to/workspace/target/release/kun_peng build --download-dir data/ --db test_database +kun_peng build output: [build output here] +kun_peng build error: [any build errors here] -### Examples - -To download genome data for bacteria from RefSeq: - -```bash -ncbi genomes --group bacteria --site refseq +Executing command: /path/to/workspace/target/release/kun_peng direct --db test_database data/COVID_19.fa +kun_peng direct output: [direct output here] +kun_peng direct error: [any direct errors here] ``` -To check the md5 of genomic files for fungi: -```bash -ncbi genomes --group fungi md5 -``` +This output confirms that the kun_peng commands were executed successfully and the files were processed as expected. -For more detailed help on a specific command, you can use the help subcommand: -```bash -ncbi help genomes -``` -This tool simplifies the process of downloading and processing genome data from NCBI, making it accessible for various research and analysis purposes. +Run the `ncbi` Example +Run the example script in the ncbi project to download the necessary files. Execute the following command from the root of the workspace: - -### Generate fna file - -```bash -ncbi gen --site all -g archaea fna +```sh +cargo run --release --example run_download --package ncbi ``` +This will run the run_download.rs example located in the ncbi project's examples directory. The script will: -## 2 Squid Tool - -Squid is a versatile command-line tool designed for the efficient processing and classification of biological sequences. 
With its suite of functionalities, Squid facilitates various tasks related to sequence analysis, taxonomy resolution, and database management, making it an essential utility for bioinformatics workflows. - -### Features -Squid offers a wide range of commands, each tailored for specific aspects of sequence data processing: - -* estimate: Estimate the capacity requirements for database construction or analysis, aiding in resource planning. -* seqid2taxid: Generate a mapping file from sequence identifiers to taxonomic IDs, facilitating the association of sequences with their respective taxonomic lineage. -* build: Construct a Squid database from a collection of sequences, optimizing it for subsequent analysis tasks. -* hashshard: Divide a hash file into smaller, more manageable shards, improving the efficiency of data processing. -* splitr: Split FASTQ or FASTA files into ranges based on sequence identifiers or other criteria, aiding in the parallel processing of large datasets. -* annotate: Annotate a set of sequences with taxonomic or other relevant information, enriching the dataset for further analysis. -* resolve: Resolve the taxonomic tree for a set of sequences, identifying their positions within the taxonomic hierarchy. -* classify: A comprehensive workflow that integrates splitr, annotate, and resolve into a unified process for the classification of sequence data. This command streamlines the task of * assigning taxonomic classifications to sequences. - -### Getting Started -To get started with Squid, you can invoke the tool with the -h or --help option to display detailed help messages for each command: - - -```bash -./kun_peng -h -Usage: kun_peng +1. Ensure the necessary directories exist. +2. 
Download the required files using the ncbi binary with the following commands: + * ./target/release/ncbi -d downloads gen -g archaea + * ./target/release/ncbi -d downloads tax -Commands: - estimate estimate capacity - seqid2taxid seqid to taxid map file - build build database - hashshard split hash file - splitr Split fast(q/a) file into ranges - annotate annotate a set of sequences - resolve resolve taxonomy tree - classify Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences - help Print this message or the help of the given subcommand(s) -Options: - -h, --help Print help - -V, --version Print version -``` - -This will provide you with an overview of all available commands and options. For specific information about a subcommand, use: +Example Output +You should see output similar to the following: +```txt +Executing command: /path/to/workspace/target/release/ncbi -d /path/to/workspace/downloads gen -g archaea +NCBI binary output: [download output here] -```bash -./kun_peng -h +Executing command: /path/to/workspace/target/release/ncbi -d /path/to/workspace/downloads tax +NCBI binary output: [download output here] ``` -Replace with the name of the subcommand for which you need detailed help, such as estimate, build, or classify. -## 2.1 Seqid2taxid Tool +## ncbi tool -The seqid2taxid tool is a utility within the kr2r package designed to facilitate the mapping of sequence identifiers (seqid) to taxonomic identifiers (taxid). This tool is essential for researchers and bioinformaticians working with genomic data, enabling them to easily relate sequence data to taxonomic information. +The ncbi binary is used to download resources from the NCBI website. 
Here is the help manual for the ncbi binary: +```sh +./target/release/ncbi -h +ncbi download resource -### Usage +Usage: ncbi [OPTIONS] -```bash -kun_peng seqid2taxid -h - -Usage: kun_peng seqid2taxid [OPTIONS] --source +Commands: + taxonomy Download taxonomy files from NCBI (alias: tax) + genomes Download genomes data from NCBI (alias: gen) + help Print this message or the help of the given subcommand(s) Options: - --source the database directory - -f, --map-file seqid2taxid.map file path, default = $source/seqid2taxid.map - -h, --help Print help - -V, --version Print version - -``` - -To use the seqid2taxid tool, execute it with the required and optional arguments as follows: - -```bash -kun_peng seqid2taxid [OPTIONS] --source + -d, --download-dir Directory to store downloaded files [default: lib] + -n, --num-threads Number of threads to use for downloading [default: 20] + -h, --help Print help (see more with '--help') + -V, --version Print version ``` -### Required Options -* --source : Specifies the database directory containing the sequence and taxonomic data. - -### Optional Options -* -f, --map-file : Path to the seqid2taxid.map file. If not specified, the tool defaults to using $source/seqid2taxid.map, where $source is the path provided by the * --source option. -* -h, --help: Displays help information about the tool and its options. -* -V, --version: Prints the version of the seqid2taxid tool. - -### Example Command -To run the seqid2taxid tool with a specific source directory: - -```bash -kun_peng seqid2taxid --source /path/to/database -``` -To specify a custom map file path: +## kun_peng tool -```bash -kun_peng seqid2taxid --source /path/to/database -f /path/to/custom/seqid2taxid.map -``` - -## 2.2 Estimate Capacity Tool - -The estimate_capacity tool is designed for estimating the capacity required for building a database from genomic data. 
It takes into consideration various parameters related to the genomic data processing, such as k-mer length, minimizer length, and hash table load factor, to provide an efficient estimate of the necessary resources. - -### Usage - -To use the estimate_capacity tool, execute it from the command line with the desired options: +```sh +Usage: kun_peng -```bash -kun_peng estimate_capacity [OPTIONS] -``` +Commands: + estimate estimate capacity + build build `k2d` files + hashshard split hash file + splitr Split fast(q/a) file into ranges + annotate annotate a set of sequences + resolve resolve taxonomy tree + classify Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences + direct Directly load all hash tables for classification annotation + merge-fna A tool for processing genomic files + help Print this message or the help of the given subcommand(s) -Options -* --source : Specifies the build database directory or file. Default is lib. -* --cache: Estimates capacity from cache if exists. -* -k, --k-mer : Sets the length of k-mers. K must be a positive integer (default is 35). K cannot be less than L. -* -l, --l-mer : Sets the length of minimizers. L must be between 1 and 31 (default is 31). -* -n, --n : Sets the maximum qualifying hash code (default is 4). -* --minimizer-spaces : Specifies the number of characters in the minimizer that are ignored in comparisons (default is 7). -* -T, --toggle-mask : Defines the minimizer ordering toggle mask. -* --load-factor : Sets the proportion of the hash table to be populated (only for build task; default is 0.7, must be between 0 and 1). -* -p, --threads : Specifies the number of threads to use (default is 10). -* -h, --help: Prints help information (for more details, use '--help'). -* -V, --version: Prints the version of the tool. 
- -### Example - -```bash -kun_peng estimate_capacity -k 35 -l 31 --source /data/ncbi/path -p 10 --load-factor 0.7 +Options: + -h, --help Print help + -V, --version Print version ``` -### Output - -```bash -estimate count: 1213069985, required capacity: 1732968825.0, Estimated hash table requirement: 6.46GB -``` +### build database -## 2.3 build +Build the kun_peng database like Kraken2, specifying the directory for the data files downloaded from NCBI, as well as the database directory. -```bash -./kun_peng build -h +```sh +./target/release/kun_peng build -h build database -Usage: kun_peng build [OPTIONS] --source -H -o -t -m --ncbi-taxonomy-directory --required-capacity --chunk-dir +Usage: kun_peng build [OPTIONS] --download-dir --db Options: - --source - build database directory or file - -H - Kraken 2 hash table filename - -o - Kraken 2 options filename + -d, --download-dir + Directory to store downloaded files + --db + ncbi library fna database directory -k, --k-mer Set length of k-mers, k must be positive integer, k=35, k cannot be less than l [default: 35] -l, --l-mer @@ -245,19 +142,7 @@ Options: -r, --requested-bits-for-taxid Bit storage requested for taxid 0 <= r < 31 [default: 0] -p, --threads - Number of threads [default: 4] - -t - Kraken 2 taxonomy filename - -m - Sequence ID to taxon map filename - -n, --ncbi-taxonomy-directory - NCBI taxonomy directory name - -c, --required-capacity - - --chunk-dir - chunk directory - --chunk-size - chunk size 1-4(GB) [default: 1073741824] + Number of threads [default: 10] --cache estimate capacity from cache if exists --max-n @@ -270,92 +155,46 @@ Options: Print version ``` -## 2.4 hashshard -```bash -./kun_peng hashshard -h -split hash file +### classify -Usage: kun_peng hashshard [OPTIONS] --db +The classification process is divided into three modes: -Options: - --db The database directory for the Kraken 2 index. 
contains index file(hash.k2d opts.k2d taxo.k2d) - --hash-dir database hash chunk directory and other files - --hash-capacity default: 1073741824(capacity 1G = file size 4G) - -h, --help Print help (see more with '--help') - -V, --version Print version -``` +1. Direct Processing Mode: -## 2.5 splitr +* Description: In this mode, all database files are loaded simultaneously, which requires a significant amount of memory. Before running this mode, you need to check the total size of hash_*.k2d files in the database directory using the provided script (bash cal_memory.sh out_dir). Ensure that your available memory meets or exceeds this size. +* Characteristics: + * High memory requirements + * Fast performance -```bash -./kun_peng splitr -h -Split fast(q/a) file into ranges +```sh +./target/release/kun_peng direct -h +Directly load all hash tables for classification annotation -Usage: kun_peng splitr [OPTIONS] --hash-dir --chunk-dir [INPUT_FILES]... +Usage: kun_peng direct [OPTIONS] --db [INPUT_FILES]... Arguments: [INPUT_FILES]... A list of input file paths (FASTA/FASTQ) to be processed by the classify program Options: - --hash-dir + --db database hash chunk directory and other files -P, --paired-end-processing Enable paired-end processing -S, --single-file-pairs Process pairs with mates in the same file -Q, --minimum-quality-score - Minimum quality score for FASTQ data, default is 0 [default: 0] - -p, --num-threads - The number of threads to use, default is 1 [default: 10] - --chunk-dir - chunk directory - -h, --help - Print help (see more with '--help') - -V, --version - Print version -``` - -## 2.6 annotate - -```bash -annotate a set of sequences - -Usage: kun_peng annotate [OPTIONS] --hash-dir --chunk-dir - -Options: - --hash-dir database hash chunk directory and other files - --chunk-dir The file path for the Kraken 2 options. 
chunk directory - --batch-size 批量处理大小 default: 8MB [default: 8388608] - -h, --help Print help (see more with '--help') - -V, --version Print version -``` - - -## 2.7 resolve - -```bash -resolve taxonomy tree - -Usage: kun_peng resolve [OPTIONS] --hash-dir --chunk-dir - -Options: - --hash-dir - database hash chunk directory and other files - --chunk-dir - chunk directory - --full-output - output file contains all unclassified seq + Minimum quality score for FASTQ data [default: 0] -T, --confidence-threshold - Confidence score threshold, default is 0.0 [default: 0] + Confidence score threshold [default: 0] -K, --report-kmer-data In comb. w/ -R, provide minimizer information in report -z, --report-zero-counts In comb. w/ -R, report taxa w/ 0 count -g, --minimum-hit-groups The minimum number of hit groups needed for a call [default: 2] - --batch-size - 批量处理大小 default: 8MB [default: 8388608] + -p, --num-threads + The number of threads to use [default: 10] --output-dir File path for outputting normal Kraken output -h, --help @@ -364,39 +203,46 @@ Options: Print version ``` +2. Chunk Processing Mode: -## 2.8 classify +* Description: This mode processes the sample data in chunks, loading only a small portion of the database files at a time. This reduces the memory requirements, needing a minimum of 4GB of memory plus the size of one pair of sample files. +* Characteristics: + * Low memory consumption + * Slower performance compared to Direct Processing Mode -```bash -./kun_peng classify -h + +```sh +./target/release/kun_peng classify -h Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences -Usage: kun_peng classify [OPTIONS] --hash-dir --chunk-dir [INPUT_FILES]... +Usage: kun_peng classify [OPTIONS] --db --chunk-dir [INPUT_FILES]... Arguments: [INPUT_FILES]... 
A list of input file paths (FASTA/FASTQ) to be processed by the classify program Options: - --hash-dir - database hash chunk directory and other files + --db + + --chunk-dir + chunk directory + --output-dir + File path for outputting normal Kraken output -P, --paired-end-processing Enable paired-end processing -S, --single-file-pairs Process pairs with mates in the same file -Q, --minimum-quality-score - Minimum quality score for FASTQ data, default is 0 [default: 0] + Minimum quality score for FASTQ data [default: 0] -p, --num-threads - The number of threads to use, default is 1 [default: 10] - --chunk-dir - chunk directory + The number of threads to use [default: 10] --batch-size - 批量处理大小 default: 8MB [default: 8388608] + 批量处理大小 default: 16MB [default: 16777216] -T, --confidence-threshold - Confidence score threshold, default is 0.0 [default: 0] + Confidence score threshold [default: 0] -g, --minimum-hit-groups The minimum number of hit groups needed for a call [default: 2] - --output-dir - File path for outputting normal Kraken output + --kraken-db-type + Enables use of a Kraken 2 compatible shared database -K, --report-kmer-data In comb. w/ -R, provide minimizer information in report -z, --report-zero-counts @@ -409,149 +255,10 @@ Options: Print version ``` -## 3. build_k2_db - -The build_k2_db command-line tool facilitates the construction of a Kraken 2 database. It requires specific filenames for the hash table, taxonomy, and the sequence ID to taxon map, among other parameters. - - -### Introduction -The build_k2_db tool introduces a novel approach to constructing Kraken 2-compatible databases, specifically addressing the challenges associated with the large memory requirements of previous methods. This documentation outlines the process flow, working principles, and the inherent advantages of using the build_k2_db tool for genomic database construction. 
- -The build_k2_db tool revolutionizes the process of building genomic databases for Kraken 2 by introducing a novel, two-step approach to database construction. This method significantly mitigates the challenges associated with the large memory requirements of traditional database building processes, particularly vital for constructing databases like the NCBI RefSeq, which are substantial in size. - -### Working Principle -#### Step 1: Preprocessing and Generation of k2 Formatted Files -Initially, the tool preprocesses .fna files to generate intermediary files in a k2 format. This step involves scanning the .fna files to extract relevant k-mer and minimizer information, mapping these to taxonomic IDs, and then hashing these elements to produce indexed intermediary data. These intermediary files are crucial for the next step of the process, as they contain indexed positions and taxonomic IDs necessary for constructing the hash table efficiently. - -#### Step 2: Iterative Construction of the Hash Table -In the second phase, the tool iteratively processes the k2 formatted intermediary files to build segments of the hash table. This method involves reading the intermediary files in batches, resolving any taxonomic ID conflicts using a Lowest Common Ancestor (LCA) algorithm, and updating the hash table with the resolved IDs. This step-by-step processing significantly reduces the memory footprint compared to loading the entire hash table into memory at once. - -#### Efficiency and Advantages -The build_k2_db tool introduces several advantages over traditional database building methods: - -* Memory Efficiency: By generating intermediary files and processing these in chunks, the tool drastically reduces the required memory, enabling the construction of large databases on systems with limited memory capacity. 
-* Scalability: The approach is highly scalable, thanks to parallel processing and efficient handling of large .fna files, making it suitable for building extensive databases.
-* Time Efficiency: Despite the intermediary files being substantially larger than the final hash table, the overall time taken to build the database is comparable to methods that process all data at once.
-#### Performance Insights
-In a performance test involving the NCBI RefSeq database, approximately 500GB of .fna files were processed to generate 850GB of k2 intermediary files. The final hash table size amounted to 188GB. Utilizing a machine equipped with a 16-core CPU and 32GB of memory, the entire database construction process was completed in just 9 hours and 42 minutes. This showcases the tool's ability to handle large datasets efficiently, both in terms of time and hardware resource requirements.
-
-#### Comparative Analysis with Kraken 2 C++ Version in Fast Mode
-
-In addition to the innovative build_k2_db Rust-based tool, it's informative to compare its performance and resource utilization with that of the traditional Kraken 2 C++ version, particularly in its fast mode operation. Such a comparison underscores the advancements and efficiencies introduced by the Rust implementation.
-
-#### Kraken 2 C++ Version in Fast Mode:
-For processing the same dataset from the NCBI RefSeq database (~500GB of .fna files), the Kraken 2 C++ version in fast mode presents the following resource requirements and performance metrics:
-
-* CPU and Memory Usage: Requires a machine with a 16-core CPU and 200GB of memory, indicating a significantly higher demand for memory resources compared to the Rust-based build_k2_db tool.
-* Time Efficiency: Completes the database construction process in approximately 9 hours and 32 minutes. This duration is slightly shorter than that of the build_k2_db tool but at the cost of substantially higher memory requirements.
-
-
-#### Key Insights and Implications:
-* Memory Optimization: The build_k2_db tool demonstrates exceptional memory efficiency by requiring only 32GB of memory to process and construct a database from a large genomic dataset. In contrast, the C++ version's fast mode requires 200GB of memory, highlighting the Rust-based tool's optimization in memory usage.
-* Comparable Time Efficiency: Despite the vast difference in memory consumption, the time taken to build the database is remarkably similar between the two tools, with the Rust version completing the task in 9 hours and 42 minutes versus 9 hours and 32 minutes for the C++ version.
-* Accessibility and Cost-effectiveness: By drastically reducing the memory requirement, the build_k2_db tool makes the process of building large genomic databases more accessible to researchers and institutions with limited hardware resources. This can significantly lower the computational costs associated with database construction in bioinformatics research.
-
-
-#### Conclusion
-The build_k2_db tool stands out for its innovative approach to genomic database construction, offering a memory-efficient, scalable, and time-effective solution. Its ability to preprocess data into intermediary files before iteratively constructing the hash table addresses the significant challenges of working with large-scale genomic databases, making it an invaluable asset in the field of bioinformatics.
-
-The build_k2_db tool not only matches the Kraken 2 C++ version in terms of processing time but does so with far less memory, making it a highly efficient and accessible option for constructing large genomic databases. Its innovative approach, leveraging Rust's performance and memory management capabilities, offers a more practical solution for the bioinformatics community, particularly when handling extensive datasets like the NCBI RefSeq database.
- - - -### Usage -To build the Kraken 2 database, you must specify source, hash table, taxonomy, ID to taxon map filenames, Kraken 2 options filename, NCBI taxonomy directory, required capacity, and chunk directory. - -```bash -build_k2_db [OPTIONS] --source -H -t -m -o --ncbi-taxonomy-directory --required-capacity --chunk-dir -``` - -### Options - -* --source : Directory or file for database build. -* -H : Filename for the Kraken 2 hash table. -* -t : Filename for the Kraken 2 taxonomy. -* -m : Filename for the sequence ID to taxon map. -* -o : Filename for Kraken 2 options. -* -n, --ncbi-taxonomy-directory : Directory name for NCBI taxonomy. -* -k, --k-mer : Length of k-mers (default: 35). -* -l, --l-mer : Length of minimizers (default: 31). -* -r, --requested-bits-for-taxid : Bit storage for taxid (default: 0). -* -T, --toggle-mask : Minimizer ordering toggle mask (default: 16392584516609989165). -* --minimizer-spaces : Characters in minimizer ignored in comparisons (default: 7). -* -c, --required-capacity : Required capacity for the database. -* -p, --threads : Number of threads (default: 4). -* --chunk-dir : Directory for chunks. -* --chunk-size : Size of chunks in GB (default: 1GB). -* --chunk-prefix : Prefix for chunk files (default: chunk). -* --only-k2: Process only k2 file. -* -h, --help: Prints help information. -* -V, --version: Prints the version of the tool. - -### Example - -Building a database with custom parameters: - -```bash -build_k2_db --source /path/to/source -H hash_table.k2 -t taxonomy.k2 -m id_to_taxon.map -o options.k2 --ncbi-taxonomy-directory /path/to/ncbi/taxonomy --required-capacity 1000000 --chunk-dir /path/to/chunks -``` - - -## 4. classify - -The classify tool is a powerful sequence classification program designed for rapid and accurate classification of nucleotide sequences. It leverages the Kraken 2 indexing and taxonomy systems to efficiently assign taxonomic labels to sequences from FASTA/FASTQ files. 
This document provides a comprehensive guide on how to use the classify tool, including its options and arguments.
-
-### Usage
-
-To classify sequences using the classify tool, execute the command with the required options and input files:
-
-```bash
-classify [OPTIONS] --index-filename --taxonomy-filename --options-filename [INPUT_FILES]...
-```
-
-#### Arguments
-
-* [INPUT_FILES]...: Specifies a list of input file paths. These files should be in FASTA or FASTQ format and contain the sequences to be classified.
-
-#### Options
-
-* -H, --index-filename : Path to the Kraken 2 index file. This file is essential for the classification process.
-* -t, --taxonomy-filename : Path to the Kraken 2 taxonomy file. This file contains taxonomic information used for classification.
-* -o, --options-filename : Path to the Kraken 2 options file. This file includes additional configuration options for Kraken 2.
-* -T, --confidence-threshold : Sets the confidence score threshold for classification. Sequences with a confidence score below this threshold will not be classified. The default value is 0.0.
-* -p, --num-threads : Specifies the number of threads to use for processing. Increasing the number of threads can speed up the classification process. The default is 10.
-* -g, --minimum-hit-groups : The minimum number of hit groups required for a classification call. The default is 2.
-* -P, --paired-end-processing: Enables processing of paired-end reads. This option should be used if your input files contain paired-end sequence data.
-* -S, --single-file-pairs: Indicates that pairs with mates are located in the same file. This option is relevant for paired-end processing.
-* -O, --kraken-output-filename : Specifies the file path for outputting the standard Kraken output. This output includes the classification results for each sequence.
-* -Q, --minimum-quality-score : Sets the minimum quality score for FASTQ data.
Sequences with a quality score below this threshold will not be classified. * The default is 0. -* -h, --help: Prints help information, providing a summary of options and usage. -* -V, --version: Displays the version of the classify tool. - -#### Example - -To classify sequences from a FASTQ file using 4 threads and a confidence threshold of 0.5: - -```bash -classify --index-filename path/to/index --taxonomy-filename path/to/taxonomy --options-filename path/to/options -T 0.5 -p 4 input_file.fastq -``` - - -## 5. inspect - -The inspect tool is designed for analyzing the content of hash table files used by Kraken 2. It provides insights into the index file, allowing users to verify and understand the structure and statistics of their Kraken 2 databases. - -### Usage -To utilize the inspect tool, execute the command with the necessary options: - -```bash -inspect [OPTIONS] --index-filename -``` +3. Step-by-Step Processing Mode: -### Options -* -H, --index-filename : Specifies the file path to the Kraken 2 index file. This option is required as it directs the tool to the hash table file that needs to be inspected. -* -t, --taxonomy-filename : Provides the file path to the Kraken 2 taxonomy file. This file contains the taxonomy information that corresponds to the data in the index file. Including this option allows for a more comprehensive inspection that may involve taxonomy data. -* -o, --options-filename : Indicates the file path to the Kraken 2 options file. This file can contain various configurations and options used by Kraken 2. * Specifying this option can help understand the configurations under which the index was created or used. -* -v, --value-count: This flag, when set, instructs the tool to iterate through the index file and count the values. It is useful for users who wish to understand the * distribution of data within their Kraken 2 index file. 
-* -h, --help: Prints out help information, providing a brief summary of all the available options and their usage. -* -V, --version: Displays the version of the inspect tool, helping users to identify the tool's version they are currently using. +* Description: This mode breaks down the chunk processing mode into individual steps, providing greater flexibility in managing the entire classification process. +* Characteristics: + * Flexible processing steps + * Similar memory consumption to Chunk Processing Mode + * Performance varies based on execution steps diff --git a/cal_memory.sh b/cal_memory.sh new file mode 100644 index 0000000..92e1450 --- /dev/null +++ b/cal_memory.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +directory=$1 + +# Find all hash_*.k2d files and calculate their total size +total_size=$(find "$directory" -name "hash_*.k2d" -exec du -ch {} + | grep total$ | awk '{print $1}') +echo "Total size of hash_*.k2d files: $total_size" diff --git a/kr2r.sh b/kr2r.sh index 35b2ad8..bda6b88 100644 --- a/kr2r.sh +++ b/kr2r.sh @@ -1,24 +1,18 @@ - - DIR=`dirname $(realpath $0 || echo $0)` +DOWNLOADS="" DATABASE="" -DATABASE_CHUNK="" +CHUNK_DIR="" -# 1. 下载 bacteria,viral 原始fna.gz格式文件和md5文件 -${DIR}/ncbi --db $DATABASE gen -g bacteria,viral -# 1.1 校验md5文件 -${DIR}/ncbi --db $DATABASE gen -g bacteria,viral md5 - -# 1.2 下载taxonomy文件 -${DIR}/ncbi --db $DATABASE taxonomy +# 1. 下载 bacteria,viral 原始fna.gz格式文件和md5文件 +${DIR}/ncbi -d $DOWNLOADS gen -g bacteria,viral -# 2. 生成library.fna和prelim_map.txt子文件 -${DIR}/ncbi --db $DATABASE gen -g bacteria,viral fna +# 2. 下载taxonomy文件 +${DIR}/ncbi -d $DATABASE taxonomy -# 3. 预估数据库大小 -# ${DIR}/Kun estimate_capacity --db $DATABASE -k 35 -l 31 +# 3. build +${DIR}/kun_peng build -d $DATABASE --db $DATABASE -# 4. build -${DIR}/kun_peng build --db $DATABASE --chunk-dir ${DATABASE_CHUNK} +# 4. classify +./target/release/kun_peng classify --db $DATABASE --chunk-dir $CHUNK_DIR $the_sample_files