diff --git a/Cargo.toml b/Cargo.toml index 9f41efe..e0da8e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mgikit" -version = "0.1.2" +version = "0.1.3" edition = "2021" authors = ["Ziad Al Bkhetan "] repository = "https://github.com/sagc-bioinformatics/mgikit" diff --git a/README.md b/README.md index dd84925..0be9814 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,25 @@ This command is to merge demultiplexing and quality reports from multiple lanes
+## Installation + +You can use the static binary under bins directly, however, if you like to build it from the source code: + +You need to have Rust and cargo installed first, check rust [documenation](https://doc.rust-lang.org/cargo/getting-started/installation.html) + +```bash +git clone https://github.com/sagc-bioinformatics/mgikit.git +cd mgikit +cargo build --release +``` + + + ## User Guide Please checkout the [documeantion](https://sagc-bioinformatics.github.io/mgikit/) + ## Commerical Use Please contact us if you want to use the software for commercial purposes. diff --git a/bins/mgikit-V0.1.3.zip b/bins/mgikit-V0.1.3.zip new file mode 100644 index 0000000..7666aeb Binary files /dev/null and b/bins/mgikit-V0.1.3.zip differ diff --git a/bins/mgikit.zip b/bins/mgikit.zip deleted file mode 100644 index 4cf7ccd..0000000 Binary files a/bins/mgikit.zip and /dev/null differ diff --git a/docs/index.md b/docs/index.md index b950578..b7d82f5 100755 --- a/docs/index.md +++ b/docs/index.md @@ -41,6 +41,19 @@ This command is to merge demultiplexing and quality reports from multiple lanes
+## Installation + +You can use the static binary under bins directly, however, if you like to build it from the source code: + +You need to have Rust and cargo installed first, check rust [documenation](https://doc.rust-lang.org/cargo/getting-started/installation.html) + + +```bash +git clone https://github.com/sagc-bioinformatics/mgikit.git +cd mgikit +cargo build --release +``` + ## User Guide Table of Content {% include section-navigation-tiles.html type="guides" %} diff --git a/docs/pages/demultiplex.md b/docs/pages/demultiplex.md index 5180e4b..35664c2 100755 --- a/docs/pages/demultiplex.md +++ b/docs/pages/demultiplex.md @@ -129,7 +129,7 @@ the number of allowed mismatches is high. + **`--report-level`**: The level of reporting. [default: 2] -+ **`--compression-level`**: The level of compression (between 0 and 12). 0 is fast but no compression, 9 is slow but high compression. [default: 1] ++ **`--compression-level`**: The level of compression (between 0 and 12). 0 is fast but no compression, 12 is slow but high compression. [default: 1] + **`--force`**: this flag is to force the run and overwrite the existing output directory if exists. @@ -356,6 +356,127 @@ multiqc mgikit-examples/test/ ``` +### Performance evaluation + +Performance time (in minutes) evaluation and comparison on different datasets. +DS01 and DS04 are 10 bp dual index, DS02 and DS3 are 8 bp dual index and DS05 is 8 bp single index. +In the case of single-end, the R2 file of the dataset is used alone for demultiplexing. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DatasetReadsSamplesLength (bp)Size (GB)Paired-endSingle-end
R1R2R1R2
DS01298303014102300320768571.537.2
DS0249466713639148172657561.531.8
DS0350660059529100124465543.530
DS04274567350528708.5191311.9
DS0550061238164508225.512-
+ +### Memory utilisation + +The default parameters of the tool are optimised to achive high performance. The majority of the memory needed is allocated for output buffering to reduce writing to disk operations. + +The expected memory usage is influnced yb three main factors, + +1. Number of samples in the sample sheet. +2. Writing buffer size (`--writing-buffer-size` parameter, default is `67108864`). +3. Compression buffer size (`--compression-buffer-size` parameter, default is `131072`). +4. Single end or paired end input data. + +The expected allocated memory is + ++ **Single-end input**: `number of smaples * (writing buffer size + 2 * compression buffer size)`. + ++ **Paired-end input**: `2 * number of smaples * (writing buffer size + 2 * compression buffer size)`. + +When using the default parameters: + ++ **Single-end input**: `number of smaples * 64.25 MB`. + ++ **Paired-end input**: `2 * number of smaples 64.25 MB`. + +Reducing the writing buffer size will reduce the reqiured memory but also affect the performance time. + + ### Execution examples You can use the datasets at `testing_data` to perform these tests. diff --git a/src/lib.rs b/src/lib.rs index 707754d..8ccb233 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,7 @@ use noodles_bgzf as bgzf; use memchr::memchr; use libdeflater::{Compressor, CompressionLvl}; use flate2::read::MultiGzDecoder; +//use sysinfo::{System, SystemExt}; // my modules @@ -30,6 +31,8 @@ use crate::index_dic::*; const BUFFER_SIZE: usize = 1 << 19; +const MAX_SAMPLES: usize = 500; + //const BLOCK_SIZE: usize = 1 << 16; //const BUFFER_SIZE_MIN:usize = 1000000; @@ -383,7 +386,8 @@ pub fn demultiplex( info_file: &String, reporting_level: usize, compression_level: u32, - dynamic_demultiplexing: bool + dynamic_demultiplexing: bool, + compression_buffer_size: usize ) -> Result<(), Box> { @@ -607,7 +611,8 @@ pub fn demultiplex( println!("The paired read files are assumed to contain the paired reads in the same order!"); - + + let mut illumina_header_template_p1 = String::new(); let mut output_file_format_r2; @@ -653,11 +658,25 @@ pub fn demultiplex( } println!("Trim Barcode: {}", trim_barcode); - // writing_threshold: usize, read_merging_threshold println!("Output buffer size: {}", writing_buffer_size); - let uncompressed_buffer_size = writing_buffer_size * 2 + 50000; - let relaxed_writing_buffer_size = ( writing_buffer_size as f32 * 0.95 ) as usize; + if writing_buffer_size < 65536{ + panic!("Writing buffer size '--writing-buffer-size' should be greater than 65536."); + } + + println!("Compression buffer size: {}", compression_buffer_size); + if compression_buffer_size > writing_buffer_size{ + panic!("Compression buffer size '--compression-buffer-size' should be less than Writing buffer size ('--writing-buffer-size')."); + } + + + let mut compressor = Compressor::new(CompressionLvl::new(compression_level as i32).unwrap()); + + + + let reqiured_output_buffer_size = writing_buffer_size + compressor.gzip_compress_bound(compression_buffer_size); + + let relaxed_writing_buffer_size = writing_buffer_size; //( writing_buffer_size as f32 * 0.95 ) as usize; // parse sample/index file and get all mismatches let tmp_res = parse_sample_index(Path::new(sample_sheet_file_path), &template, i7_rc, i5_rc).unwrap(); @@ -804,35 +823,37 @@ pub fn demultiplex( println!("It is assumed that read 2 contains barcode only without read sequence!"); } - + let total_samples:usize = sample_information.len(); let barcode_read_actual_length = (barcode_read_length - barcode_length) as u64; let paired_read_length_64 = paired_read_length as u64; //let barcode_read_length_64 = barcode_read_length as u64; let barcode_length_u64 = barcode_length as u64; - let mut extract_umi; - + + let mut sample_mismatches: Vec> = Vec::new(); let mut sample_statistics: Vec> = Vec::new(); let mut out_read_barcode_buffer: Vec> = Vec::new(); let mut out_paired_read_buffer: Vec> = Vec::new(); - let mut out_read_barcode_buffer_last: Vec = Vec::new(); - let mut out_paired_read_buffer_last : Vec = Vec::new(); + let mut out_read_barcode_buffer_last: [usize; MAX_SAMPLES] = [0; MAX_SAMPLES]; + let mut out_paired_read_buffer_last : [usize; MAX_SAMPLES] = [0; MAX_SAMPLES]; - let mut out_read_barcode_compression_buffer: Vec> = Vec::new(); - let mut out_paired_read_compression_buffer: Vec> = Vec::new(); - let mut out_read_barcode_compression_buffer_last: Vec = Vec::new(); - let mut out_paired_read_compression_buffer_last : Vec = Vec::new(); - let writen_barcode_length :usize = match trim_barcode { true => barcode_length, false => 0 }; + let mut out_read_barcode_compression_buffer_last: [usize; MAX_SAMPLES] = [0; MAX_SAMPLES]; + let mut out_paired_read_compression_buffer_last : [usize; MAX_SAMPLES] = [0; MAX_SAMPLES]; + + + + //let available_memory: u64 = System::new_all().get_available_memory(); + //println!("Available Memory is {} kB", available_memory); + let mut output_barcode_file_paths: Vec> = Vec::new(); let mut output_paired_file_paths: Vec> = Vec::new(); - for i in 0..sample_information.len(){ sample_mismatches.push(vec![0; 2 * allowed_mismatches + 2]); @@ -883,29 +904,7 @@ pub fn demultiplex( output_barcode_file_paths.push(Some(tmp.clone())); output_paired_file_paths.push(Some(tmp1.clone())); - /* - let output_file_path = Path::new(&tmp); - let outfile = OpenOptions::new() - .append(true) - .create(true) - .open(output_file_path) - .expect("couldn't create output"); - - output_barcode_file_writers.push(Some(outfile)); - - if !single_read_input { - let output_file_path = Path::new(&tmp1); - let outfile = OpenOptions::new() - .append(true) - .create(true) - .open(output_file_path) - .expect("couldn't create output"); - output_paired_file_writers.push(Some(outfile)); - }else{ - output_paired_file_writers.push(None); - } - */ }else{ output_barcode_file_paths.push(None); output_paired_file_paths.push(None); @@ -914,23 +913,31 @@ pub fn demultiplex( sample_statistics.push(vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); - out_read_barcode_buffer_last.push(0); - out_read_barcode_buffer.push(vec![0; writing_buffer_size]); - out_read_barcode_compression_buffer_last.push(0); - out_read_barcode_compression_buffer.push(vec![0; uncompressed_buffer_size]); - - - out_paired_read_buffer_last.push(0); - out_paired_read_compression_buffer_last.push(0); + if read2_has_sequence || i >= undetermined_label_id{ + out_read_barcode_buffer.push(vec![0; reqiured_output_buffer_size]); + + }else{ + out_read_barcode_buffer.push(vec![]); + } + out_read_barcode_compression_buffer_last[i] = i * compression_buffer_size; if ! single_read_input{ - out_paired_read_buffer.push( vec![0; writing_buffer_size]); - out_paired_read_compression_buffer.push( vec![0; uncompressed_buffer_size]); - + out_paired_read_buffer.push( vec![0; reqiured_output_buffer_size]); + out_paired_read_compression_buffer_last[i] = i * compression_buffer_size; } + } + let mut out_read_barcode_compression_buffer: Vec = vec![0; compression_buffer_size * total_samples]; + + let mut out_paired_read_compression_buffer = if ! single_read_input{ + vec![0; compression_buffer_size * total_samples] + }else{ + Vec::new() + }; + + //let mut curr_template; let mut sample_id; let mut barcode_read_illumina_header_start:usize = 0; @@ -960,22 +967,6 @@ pub fn demultiplex( tmp_reader }; - /* - let mut reader_paired_read: Option> = if !single_read_input { - if bgzf_format_in{ - Some(File::open(Path::new(&paired_read_file_path_final)).map(bgzf::Reader::new).unwrap()) - }else{ - Some(niffler::get_reader(Box::new(std::fs::File::open(paired_read_file_path_final).unwrap())).unwrap()) - } - } else { - None - }; - - */ - - - //let (mut reader_barcode_read, _)= niffler::get_reader(Box::new(std::fs::File::open(read_barcode_file_path_final).unwrap())).unwrap(); - let mut reader_paired_read = if !single_read_input { let (s1, _) = niffler::get_reader(Box::new(std::fs::File::open(paired_read_file_path_final).unwrap())).unwrap(); Some(s1) @@ -1019,12 +1010,12 @@ pub fn demultiplex( let mut curr_buffer_end; let mut template_itr:usize; - let mut compressor = Compressor::new(CompressionLvl::new(compression_level as i32).unwrap()); let mut actual_sz:usize; let mut curr_writing_sample:usize; - let total_samples:usize = sample_information.len(); + let mut curr_writer: File; + let mut curr_buffer_limit:usize; loop { //println!("Read: {}", read_cntr); @@ -1381,17 +1372,21 @@ pub fn demultiplex( sample_mismatches[sample_id][0] += 1; sample_mismatches[sample_id][curr_mismatch + 1] += 1; + + curr_writing_sample = writing_samples[sample_id]; // writing preperation - curr_buffer_end = out_read_barcode_buffer_last[curr_writing_sample]; + curr_buffer_end = out_read_barcode_compression_buffer_last[curr_writing_sample]; + curr_buffer_limit = curr_writing_sample * compression_buffer_size + compression_buffer_size; // this works for mgi format and unde and ambig and ilumina with a bit of extr - if curr_buffer_end + read_end - header_start + illumina_header_template_bytes.len() >= writing_buffer_size{ - - actual_sz = compressor.gzip_compress(&out_read_barcode_buffer[curr_writing_sample][..curr_buffer_end], &mut out_read_barcode_compression_buffer[curr_writing_sample][out_read_barcode_compression_buffer_last[curr_writing_sample]..]).unwrap(); - out_read_barcode_compression_buffer_last[curr_writing_sample] += actual_sz; - curr_buffer_end = 0; + if curr_buffer_end + read_end - header_start + illumina_header_template_bytes.len() >= curr_buffer_limit { + actual_sz = compressor.gzip_compress(&out_read_barcode_compression_buffer[curr_writing_sample * compression_buffer_size..curr_buffer_end], + &mut out_read_barcode_buffer[curr_writing_sample][out_read_barcode_buffer_last[curr_writing_sample]..]).unwrap(); + out_read_barcode_buffer_last[curr_writing_sample] += actual_sz; + + curr_buffer_end = curr_writing_sample * compression_buffer_size; - if out_read_barcode_compression_buffer_last[curr_writing_sample] >= relaxed_writing_buffer_size { + if out_read_barcode_buffer_last[curr_writing_sample] >= relaxed_writing_buffer_size { match &output_barcode_file_paths[curr_writing_sample]{ Some(output_file_path) => { curr_writer = OpenOptions::new() @@ -1399,41 +1394,40 @@ pub fn demultiplex( .create(true) .open(output_file_path) .expect("couldn't create output"); - curr_writer.write_all(&out_read_barcode_compression_buffer[curr_writing_sample][..out_read_barcode_compression_buffer_last[curr_writing_sample]]).unwrap(); + curr_writer.write_all(&out_read_barcode_buffer[curr_writing_sample][..out_read_barcode_buffer_last[curr_writing_sample]]).unwrap(); curr_writer.flush().expect("couldn't flush output"); - out_read_barcode_compression_buffer_last[curr_writing_sample] = 0; + out_read_barcode_buffer_last[curr_writing_sample] = 0; }, None => panic!("expeted a writer, but None found!") }; - } - + } } if sample_id >= undetermined_label_id{ - out_read_barcode_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + read_end - header_start + 1]. + out_read_barcode_compression_buffer[curr_buffer_end..curr_buffer_end + read_end - header_start + 1]. copy_from_slice(&buffer_2[header_start..read_end + 1]); - out_read_barcode_buffer_last[curr_writing_sample] = curr_buffer_end + read_end - header_start + 1; + out_read_barcode_compression_buffer_last[curr_writing_sample] = curr_buffer_end + read_end - header_start + 1; }else if read2_has_sequence { if illumina_format{ // Illumina format write the heder and skip the header for mgi. barcode_read_illumina_header_start = curr_buffer_end; - out_read_barcode_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + illumina_header_template_bytes.len()]. + out_read_barcode_compression_buffer[curr_buffer_end..curr_buffer_end + illumina_header_template_bytes.len()]. copy_from_slice(&illumina_header_template_bytes[..]); curr_buffer_end += illumina_header_template_bytes.len(); - curr_buffer_end = write_illumina_header(&mut out_read_barcode_buffer[curr_writing_sample], + curr_buffer_end = write_illumina_header(&mut out_read_barcode_compression_buffer, curr_buffer_end, &buffer_2[header_start..seq_start], &curr_umi.as_bytes(), l_position); - out_read_barcode_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + curr_barcode.len()] + out_read_barcode_compression_buffer[curr_buffer_end..curr_buffer_end + curr_barcode.len()] .copy_from_slice(&curr_barcode.as_bytes()); curr_buffer_end += curr_barcode.len(); barcode_read_illumina_header_end = curr_buffer_end; @@ -1441,35 +1435,29 @@ pub fn demultiplex( } - out_read_barcode_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + plus_start - header_start - writen_barcode_length - 1]. + out_read_barcode_compression_buffer[curr_buffer_end..curr_buffer_end + plus_start - header_start - writen_barcode_length - 1]. copy_from_slice(&buffer_2[header_start..plus_start - writen_barcode_length - 1]); curr_buffer_end += plus_start - header_start - writen_barcode_length - 1; - out_read_barcode_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + read_end - plus_start - writen_barcode_length + 1]. + out_read_barcode_compression_buffer[curr_buffer_end..curr_buffer_end + read_end - plus_start - writen_barcode_length + 1]. copy_from_slice(&buffer_2[plus_start - 1..read_end - writen_barcode_length]); curr_buffer_end += read_end - writen_barcode_length - plus_start + 2; - out_read_barcode_buffer[curr_writing_sample][curr_buffer_end - 1] = b'\n'; - out_read_barcode_buffer_last[curr_writing_sample] = curr_buffer_end; + out_read_barcode_compression_buffer[curr_buffer_end - 1] = b'\n'; + out_read_barcode_compression_buffer_last[curr_writing_sample] = curr_buffer_end; } if ! single_read_input{ - curr_buffer_end = out_paired_read_buffer_last[curr_writing_sample]; - /* - println!("{} - {} - {} - {} - {} --- {} : {}", writing_buffer_size, relaxed_writing_buffer_size, uncompressed_buffer_size, - curr_buffer_end, - out_paired_read_compression_buffer_last[curr_writing_sample], - compressor.gzip_compress_bound(out_paired_read_compression_buffer_last[curr_writing_sample]), - out_paired_read_compression_buffer[curr_writing_sample][out_paired_read_compression_buffer_last[curr_writing_sample]..].len()); - */ - if curr_buffer_end + read_end_pr - header_start_pr + illumina_header_template_bytes.len() >= writing_buffer_size{ - actual_sz = compressor.gzip_compress(&out_paired_read_buffer[curr_writing_sample][..curr_buffer_end], &mut out_paired_read_compression_buffer[curr_writing_sample][out_paired_read_compression_buffer_last[curr_writing_sample]..]).unwrap(); - out_paired_read_compression_buffer_last[curr_writing_sample] += actual_sz; - curr_buffer_end = 0; - - if out_paired_read_compression_buffer_last[curr_writing_sample] >= relaxed_writing_buffer_size { + curr_buffer_end = out_paired_read_compression_buffer_last[curr_writing_sample]; + if curr_buffer_end + read_end_pr - header_start_pr + illumina_header_template_bytes.len() >= curr_buffer_limit{ + actual_sz = compressor.gzip_compress(&out_paired_read_compression_buffer[curr_writing_sample * compression_buffer_size .. curr_buffer_end], + &mut out_paired_read_buffer[curr_writing_sample][out_paired_read_buffer_last[curr_writing_sample]..]).unwrap(); + out_paired_read_buffer_last[curr_writing_sample] += actual_sz; + curr_buffer_end = curr_writing_sample * compression_buffer_size; + + if out_paired_read_buffer_last[curr_writing_sample] >= relaxed_writing_buffer_size { match &output_paired_file_paths[curr_writing_sample]{ Some(output_file_path) => { curr_writer = OpenOptions::new() @@ -1477,9 +1465,9 @@ pub fn demultiplex( .create(true) .open(output_file_path) .expect("couldn't create output"); - curr_writer.write_all(&out_paired_read_compression_buffer[curr_writing_sample][..out_paired_read_compression_buffer_last[curr_writing_sample]]).unwrap(); + curr_writer.write_all(&out_paired_read_buffer[curr_writing_sample][..out_paired_read_buffer_last[curr_writing_sample]]).unwrap(); curr_writer.flush().expect("couldn't flush output"); - out_paired_read_compression_buffer_last[curr_writing_sample] = 0; + out_paired_read_buffer_last[curr_writing_sample] = 0; }, None => panic!("expeted a writer, but None found!") @@ -1488,18 +1476,36 @@ pub fn demultiplex( } if illumina_format && sample_id < undetermined_label_id{ - out_paired_read_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + barcode_read_illumina_header_end - barcode_read_illumina_header_start]. - copy_from_slice(&out_read_barcode_buffer[curr_writing_sample][barcode_read_illumina_header_start..barcode_read_illumina_header_end]); - curr_buffer_end += barcode_read_illumina_header_end - barcode_read_illumina_header_start; - out_paired_read_buffer[curr_writing_sample][curr_buffer_end - curr_barcode.len() - 6] = buffer_1[seq_start_pr - 2]; + + if read2_has_sequence{ + out_paired_read_compression_buffer[curr_buffer_end..curr_buffer_end + barcode_read_illumina_header_end - barcode_read_illumina_header_start]. + copy_from_slice(&out_read_barcode_compression_buffer[barcode_read_illumina_header_start..barcode_read_illumina_header_end]); + curr_buffer_end += barcode_read_illumina_header_end - barcode_read_illumina_header_start; + out_paired_read_compression_buffer[curr_buffer_end - curr_barcode.len() - 6] = buffer_1[seq_start_pr - 2]; + }else{ + + out_paired_read_compression_buffer[curr_buffer_end..curr_buffer_end + illumina_header_template_bytes.len()]. + copy_from_slice(&illumina_header_template_bytes[..]); + curr_buffer_end += illumina_header_template_bytes.len(); + + curr_buffer_end = write_illumina_header(&mut out_paired_read_compression_buffer, + curr_buffer_end, &buffer_1[header_start_pr..seq_start_pr], + &curr_umi.as_bytes(), l_position); + + out_paired_read_compression_buffer[curr_buffer_end..curr_buffer_end + curr_barcode.len()] + .copy_from_slice(&curr_barcode.as_bytes()); + curr_buffer_end += curr_barcode.len(); + + } + header_start_pr = seq_start_pr - 1; } - out_paired_read_buffer[curr_writing_sample][curr_buffer_end..curr_buffer_end + read_end_pr - header_start_pr + 1]. + out_paired_read_compression_buffer[curr_buffer_end..curr_buffer_end + read_end_pr - header_start_pr + 1]. copy_from_slice(&buffer_1[header_start_pr..read_end_pr + 1]); - out_paired_read_buffer_last[curr_writing_sample] = curr_buffer_end + read_end_pr - header_start_pr + 1; + out_paired_read_compression_buffer_last[curr_writing_sample] = curr_buffer_end + read_end_pr - header_start_pr + 1; if read_end_pr + whole_paired_read_len >= read_bytes_1 { if read_end_pr < read_bytes_1 - 1 && header_start_pr > 0 { @@ -1559,7 +1565,7 @@ pub fn demultiplex( - for sample_id in 0..out_read_barcode_buffer.len() { + for sample_id in 0..total_samples { sample_statistics[sample_id][3] = paired_read_length_64 * sample_mismatches[sample_id][0]; sample_statistics[sample_id][6] -= sample_statistics[sample_id][3] * 33; @@ -1571,9 +1577,10 @@ pub fn demultiplex( if curr_writing_sample == sample_id { if !single_read_input { - if out_paired_read_buffer_last[curr_writing_sample] > 0{ - actual_sz = compressor.gzip_compress(&out_paired_read_buffer[curr_writing_sample][..out_paired_read_buffer_last[curr_writing_sample]], &mut out_paired_read_compression_buffer[curr_writing_sample][out_paired_read_compression_buffer_last[curr_writing_sample]..]).unwrap(); - out_paired_read_compression_buffer_last[curr_writing_sample] += actual_sz; + if out_paired_read_compression_buffer_last[curr_writing_sample] > curr_writing_sample * compression_buffer_size { + actual_sz = compressor.gzip_compress(&out_paired_read_compression_buffer[curr_writing_sample * compression_buffer_size..out_paired_read_compression_buffer_last[curr_writing_sample]], + &mut out_paired_read_buffer[curr_writing_sample][out_paired_read_buffer_last[curr_writing_sample]..]).unwrap(); + out_paired_read_buffer_last[curr_writing_sample] += actual_sz; match &output_paired_file_paths[curr_writing_sample]{ Some(output_file_path) => { @@ -1582,8 +1589,8 @@ pub fn demultiplex( .create(true) .open(output_file_path) .expect("couldn't create output"); - curr_writer.write_all(&out_paired_read_compression_buffer[curr_writing_sample][..out_paired_read_compression_buffer_last[curr_writing_sample]]).unwrap(); - out_paired_read_compression_buffer_last[curr_writing_sample] = 0; + curr_writer.write_all(&out_paired_read_buffer[curr_writing_sample][..out_paired_read_buffer_last[curr_writing_sample]]).unwrap(); + out_paired_read_buffer_last[curr_writing_sample] = 0; curr_writer.flush().unwrap(); }, None => panic!("expeted a writer, but None found!") @@ -1592,9 +1599,10 @@ pub fn demultiplex( } - if out_read_barcode_buffer_last[curr_writing_sample] > 0{ - actual_sz = compressor.gzip_compress( &out_read_barcode_buffer[curr_writing_sample][..out_read_barcode_buffer_last[curr_writing_sample]], &mut out_read_barcode_compression_buffer[curr_writing_sample][out_read_barcode_compression_buffer_last[curr_writing_sample]..]).unwrap(); - out_read_barcode_compression_buffer_last[curr_writing_sample] += actual_sz; + if out_read_barcode_compression_buffer_last[curr_writing_sample] > curr_writing_sample * compression_buffer_size{ + actual_sz = compressor.gzip_compress(&out_read_barcode_compression_buffer[curr_writing_sample * compression_buffer_size ..out_read_barcode_compression_buffer_last[curr_writing_sample]], + &mut out_read_barcode_buffer[curr_writing_sample][out_read_barcode_buffer_last[curr_writing_sample]..]).unwrap(); + out_read_barcode_buffer_last[curr_writing_sample] += actual_sz; match &output_barcode_file_paths[curr_writing_sample]{ Some(output_file_path) => { @@ -1603,8 +1611,8 @@ pub fn demultiplex( .create(true) .open(output_file_path) .expect("couldn't create output"); - curr_writer.write_all(&out_read_barcode_compression_buffer[curr_writing_sample][..out_read_barcode_compression_buffer_last[curr_writing_sample]]).unwrap(); - out_read_barcode_compression_buffer_last[curr_writing_sample] = 0; + curr_writer.write_all(&out_read_barcode_buffer[curr_writing_sample][..out_read_barcode_buffer_last[curr_writing_sample]]).unwrap(); + out_read_barcode_buffer_last[curr_writing_sample] = 0; curr_writer.flush().unwrap(); }, diff --git a/src/main.rs b/src/main.rs index 0ff4fc8..5371270 100644 --- a/src/main.rs +++ b/src/main.rs @@ -239,13 +239,19 @@ fn main() { .long("compression-level") .default_value("1") .value_parser(clap::value_parser!(u32)) - .help("The level of compression (between 0 and 9). 0 is fast but no compression, 9 is slow but high compression.") + .help("The level of compression (between 0 and 12). 0 is fast but no compression, 12 is slow but high compression.") ).arg( Arg::new("arg_dynamic") .long("flexible") .action(ArgAction::SetTrue) .default_value("false") .help("Determine reads based on the new lines rather than the expect length of the read parts.") + ).arg( + Arg::new("arg_compression_buffer_size") + .long("compression-buffer-size") + .default_value("131072") + .value_parser(clap::value_parser!(usize)) + .help("The size of the buffer for data compression for each sample.") ) ) .subcommand( @@ -373,6 +379,8 @@ fn main() { let arg_report_level: &usize = demultiplex_command.get_one::("arg_report_level").unwrap(); let arg_compression_level: &u32 = demultiplex_command.get_one::("arg_compression_level").unwrap(); let arg_dynamic: &bool = demultiplex_command.get_one::("arg_dynamic").unwrap(); + let arg_compression_buffer_size: &usize = demultiplex_command.get_one::("arg_compression_buffer_size").unwrap(); + match demultiplex( arg_input_folder_path, @@ -401,7 +409,8 @@ fn main() { arg_info_file, *arg_report_level, *arg_compression_level, - *arg_dynamic + *arg_dynamic, + *arg_compression_buffer_size ) { Ok(_) => {}, Err(err) => eprintln!("Error: {}", err), diff --git a/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.general b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.general new file mode 100755 index 0000000..0aaf37f --- /dev/null +++ b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.general @@ -0,0 +1,10 @@ +#Lane statistics +Run ID-Lane Mb Total Yield M Total Clusters % bases ≥ Q30 Mean Quality % Perfect Index +FC01-L01 0.001 0.00002 51.700 28.432 100.000 +#sample general info +Sample ID M Clusters Mb Yield ≥ Q30 % R1 Yield ≥ Q30 % R2 Yield ≥ Q30 % R3 Yield ≥ Q30 % Perfect Index +Undetermined 0.00001 0.00025 50.000 0.000 60.000 100.000 +Sample03 0.000004 0 0.000 0.000 100.000 100.000 +Sample04 0.000004 0.000167 83.500 0.000 0.000 100.000 +Sample01 0.000001 0.00005 100.000 0.000 100.000 100.000 +Sample02 0.000001 0.00005 100.000 0.000 0.000 100.000 diff --git a/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.info b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.info new file mode 100755 index 0000000..80a0fc4 --- /dev/null +++ b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.info @@ -0,0 +1,6 @@ +sample 0-mismatches +Undetermined 10 +Sample03 4 +Sample04 4 +Sample01 1 +Sample02 1 diff --git a/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.sample_stats b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.sample_stats new file mode 100755 index 0000000..697a29c --- /dev/null +++ b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.sample_stats @@ -0,0 +1,6 @@ +job_number sample_id r1_qc_30 r2_qc_30 r3_qc_30 r1_bases r2_bases r3_bases r1_qc r2_qc r3_qc all_reads 0-mismatches +. Sample01 50 0 16 50 0 16 1650 0 480 1 1 +. Sample02 50 0 0 50 0 16 1700 0 416 1 1 +. Sample03 0 0 64 200 0 64 5600 0 2176 4 4 +. Sample04 167 0 0 200 0 64 6162 0 1792 4 4 +. Undetermined 250 0 96 500 0 160 13320 0 4768 10 10 diff --git a/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.undetermined_barcode b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.undetermined_barcode new file mode 100755 index 0000000..64beb5b --- /dev/null +++ b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.undetermined_barcode @@ -0,0 +1,10 @@ +GGATATAT+GGNGCGCG 1 +ATATATAT+CGCGCGCG 1 +ATAGGTAT+CGCTTTCG 1 +ACTTTAAT+GGCTAGAG 1 +ACGTGCAT+GGCTAGAG 1 +ACGTGCAT+GGCGAGAG 1 +ACGGGCAT+GGCTNGAG 1 +ACGCTAAT+GGGGAGAG 1 +ACGCGTAT+GGCTAGAG 1 +ACGCGAAT+GGCTGGAG 1 diff --git a/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.undetermined_barcode.complete b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.undetermined_barcode.complete new file mode 100755 index 0000000..64beb5b --- /dev/null +++ b/testing_data/expected/ds010/ds010-0/FC01.L01.mgikit.undetermined_barcode.complete @@ -0,0 +1,10 @@ +GGATATAT+GGNGCGCG 1 +ATATATAT+CGCGCGCG 1 +ATAGGTAT+CGCTTTCG 1 +ACTTTAAT+GGCTAGAG 1 +ACGTGCAT+GGCTAGAG 1 +ACGTGCAT+GGCGAGAG 1 +ACGGGCAT+GGCTNGAG 1 +ACGCTAAT+GGGGAGAG 1 +ACGCGTAT+GGCTAGAG 1 +ACGCGAAT+GGCTGGAG 1 diff --git a/testing_data/expected/ds010/ds010-0/Sample01_S1_L01_R1_001.fastq.gz b/testing_data/expected/ds010/ds010-0/Sample01_S1_L01_R1_001.fastq.gz new file mode 100755 index 0000000..a9e41f2 Binary files /dev/null and b/testing_data/expected/ds010/ds010-0/Sample01_S1_L01_R1_001.fastq.gz differ diff --git a/testing_data/expected/ds010/ds010-0/Sample02_S2_L01_R1_001.fastq.gz b/testing_data/expected/ds010/ds010-0/Sample02_S2_L01_R1_001.fastq.gz new file mode 100755 index 0000000..1db0d36 Binary files /dev/null and b/testing_data/expected/ds010/ds010-0/Sample02_S2_L01_R1_001.fastq.gz differ diff --git a/testing_data/expected/ds010/ds010-0/Sample03_S3_L01_R1_001.fastq.gz b/testing_data/expected/ds010/ds010-0/Sample03_S3_L01_R1_001.fastq.gz new file mode 100755 index 0000000..35375fd Binary files /dev/null and b/testing_data/expected/ds010/ds010-0/Sample03_S3_L01_R1_001.fastq.gz differ diff --git a/testing_data/expected/ds010/ds010-0/Sample04_S4_L01_R1_001.fastq.gz b/testing_data/expected/ds010/ds010-0/Sample04_S4_L01_R1_001.fastq.gz new file mode 100755 index 0000000..ff14a62 Binary files /dev/null and b/testing_data/expected/ds010/ds010-0/Sample04_S4_L01_R1_001.fastq.gz differ diff --git a/testing_data/expected/ds010/ds010-0/Undetermined_L01_R1_001.fastq.gz b/testing_data/expected/ds010/ds010-0/Undetermined_L01_R1_001.fastq.gz new file mode 100755 index 0000000..266dbfa Binary files /dev/null and b/testing_data/expected/ds010/ds010-0/Undetermined_L01_R1_001.fastq.gz differ diff --git a/testing_data/expected/ds010/ds010-0/Undetermined_L01_R2_001.fastq.gz b/testing_data/expected/ds010/ds010-0/Undetermined_L01_R2_001.fastq.gz new file mode 100755 index 0000000..533d331 Binary files /dev/null and b/testing_data/expected/ds010/ds010-0/Undetermined_L01_R2_001.fastq.gz differ diff --git a/testing_data/expected/ds010/sample_sheet_expected.tsv b/testing_data/expected/ds010/sample_sheet_expected.tsv new file mode 100644 index 0000000..ee6f8f8 --- /dev/null +++ b/testing_data/expected/ds010/sample_sheet_expected.tsv @@ -0,0 +1,5 @@ +sample_id i7 i5 job_number template i7_rc i5_rc +Sample01 ACGGGCAT GGCTAGAG . i78:i58 0 0 +Sample02 ACGCTAAT GGCTAGAG . i78:i58 0 0 +Sample03 ACGCCCAT GGCCAGAG . i78:i58 0 0 +Sample04 GGGTCGAT GGGGACTA . i78:i58 0 0 diff --git a/testing_data/input/ds010/L01/BioInfo.csv b/testing_data/input/ds010/L01/BioInfo.csv new file mode 100755 index 0000000..6bf0bf1 --- /dev/null +++ b/testing_data/input/ds010/L01/BioInfo.csv @@ -0,0 +1,23 @@ +ISW Version, +Machine ID,R2100000000001 +Sequence Type, +Recipe Version, +Sequence Date,2023-03-30 +Sequence Time,12:00:00 +Sequencing Cartridge ID, +Cleaning Cartridge ID, +Flow Cell ID, +Flow Cell Pos, +Barcode Type, +Barcode File, +Read1, +Read2, +Barcode, +Dual Barcode, +Read1 Dark Reaction, +Read2 Dark Reaction, +Resume Cycles, +Read1 Camera Digital Gains, +Read2 Camera Digital Gains, +Full Sequencing Cartridge ID, +Full Cleaning Cartridge ID, diff --git a/testing_data/input/ds010/L01/FC01_L01_read_1.fq.gz b/testing_data/input/ds010/L01/FC01_L01_read_1.fq.gz new file mode 100755 index 0000000..b183486 Binary files /dev/null and b/testing_data/input/ds010/L01/FC01_L01_read_1.fq.gz differ diff --git a/testing_data/input/ds010/L01/FC01_L01_read_2.fq.gz b/testing_data/input/ds010/L01/FC01_L01_read_2.fq.gz new file mode 100755 index 0000000..1cb62aa Binary files /dev/null and b/testing_data/input/ds010/L01/FC01_L01_read_2.fq.gz differ diff --git a/testing_data/input/ds010/info.txt b/testing_data/input/ds010/info.txt new file mode 100755 index 0000000..255f999 --- /dev/null +++ b/testing_data/input/ds010/info.txt @@ -0,0 +1,170 @@ +Taken from DS01. no R2 sequence. Just barcode + +Read length = 8 +i7 length = 8 +i5 length = 8 + +indexes: + +S1: + +ACGGGCAT GGCTAGAG +ACGCTAAT GGCTAGAG +ACGCCCAT GGCCAGAG +GGGTCGAT GGGGACTA + + +S1 and S2 ( 3bp in i7 are different ) +S3 and S2 2 bp in i7 and 1bp in i5 +S3 and S1 2 bp in i7 and 1bp in i5 +S4 many bp are different + + +scores: +R1: (formatted as count * QS) +#1: 40 * 0 + 10 * 22 +#2: 50 * 4 +#3: 50 * 26 +#4: 50 * 33 +#5: 50 * 34 +#6: 50 * 34 +#7: 50 * 34 +#8: 50 * 34 +#9: 50 * 28 +#10: 50 * 28 +#11: 50 * 28 +#12: 50 * 28 +#13: 32 * 30 and 1 * 20 and 17 * 40 +#14: 32 * 16 + 18 * 30 +#15: 50 * 39 +#16: 50 * 30 +#17: 50 * 28 +#18: 50 * 34 +#19: 50 * 28 +#20: 50 * 40 + +R2: (formatted as count * QS) +#1: 1 * 20 and 32 * 30, and 33 * 40 (16 in these barcode and rest are in the read) +#2: 32 * 16 + 34 * 30 (16 in the barcode) +#3: 66 * 39 (16 in the barcode) +#4: 66 * 30 (16 in the barcode) +#5: 40 * 0 + 26 * 22 (16 in the barcode) +#6: 66 * 4 (16 in the barcode) +#7: 66 * 26 (16 in the barcode) +#8: 66 * 33 (16 in the barcode) +#9: 66 * 34 (16 in the barcode) +#10: 66 * 34 (16 in the barcode) +#11: 66 * 34 (16 in the barcode) +#12: 66 * 34 (16 in the barcode) +#13: 66 * 28(16 in the barcode) +#14: 66 * 28(16 in the barcode) +#15: 66 * 28(16 in the barcode) +#16: 66 * 28(16 in the barcode) +#17: 66 * 28 (16 in the barcode) +#18: 66 * 34 (16 in the barcode) +#19: 66 * 28 (16 in the barcode) +#20: 66 * 40 (16 in the barcode) + + +readqc: + +1 40 0 R1 +2 50 4 R1 +3 50 26 R1 +4 50 33 R1 +5 50 34 R1 +6 50 34 R1 +7 50 34 R1 +8 50 34 R1 +9 50 28 R1 +10 50 28 R1 +11 50 28 R1 +12 50 28 R1 +13 32 30 R1 +14 32 16 R1 +15 50 39 R1 +16 50 30 R1 +17 50 28 R1 +18 50 34 R1 +19 50 28 R1 +20 50 40 R1 +13 1 20 R1 +13 17 40 R1 +14 18 30 R1 +1 10 22 R1 +1 1 20 R2 +2 32 16 R2 +3 50 39 R2 +4 50 30 R2 +5 40 0 R2 +6 50 4 R2 +7 50 26 R2 +8 50 33 R2 +9 50 34 R2 +10 50 34 R2 +11 50 34 R2 +12 50 34 R2 +13 50 28 R2 +14 50 28 R2 +15 50 28 R2 +16 50 28 R2 +17 50 28 R2 +18 50 34 R2 +19 50 28 R2 +20 50 40 R2 +1 17 40 R2 +1 32 30 R2 +2 18 30 R2 +5 10 22 R2 +1 16 40 R3 +2 16 30 R3 +3 16 39 R3 +4 16 30 R3 +5 16 22 R3 +6 16 4 R3 +7 16 26 R3 +8 16 33 R3 +9 16 34 R3 +10 16 34 R3 +11 16 34 R3 +12 16 34 R3 +13 16 28 R3 +14 16 28 R3 +15 16 28 R3 +16 16 28 R3 +17 16 28 R3 +18 16 34 R3 +19 16 28 R3 +20 16 40 R3 + + + +Matches: (mismatches : samples) +#1: (0:) (1: s1,) (2: s1,) (3: ) (4: s2, s3) +#2: (0:) (1: s1) (2:) (3: s2, s3) (4: ) +#3: (0:) (1: ) (2: s1) (3:s3 ) (4: ) +#4: (0: s1) (1: ) (2: s1) (3: s2, s3) (4: ) + +#5: (0:) (1: ) (2: s2) (3: ) (4: s1) +#6: (0:) (1: ) (2: s2) (3: s2, s3) (4: s2, s1) +#7: (0:s2) (1: ) (2: ) (3: s1, s3) (4: ) +#8: (0:) (1: ) (2: s2 ) (3: ) (4: s3) + +#9: (0: s3) (1: ) (2: ) (3: s1, s2) (4: ) +#10: (0: s3) (1: ) (2: ) (3: s1, s2) (4: ) +#11: (0:s3) (1: ) (2: ) (3: s1,s2) (4: ) +#12: (0: s3) (1: ) (2: ) (3: s1, s2) (4: ) + +#13: (0:s4) (1: ) (2: ) (3: ) (4: ) +#14: (0:s4) (1: ) (2: ) (3: ) (4: ) +#15: (0:s4) (1: ) (2: ) (3: ) (4: ) +#16: (0:s4) (1: ) (2: ) (3: ) (4 : ) + + +#17: (0:) (1: ) (2: ) (3: ) (4: ) +#18: (0:) (1: ) (2: ) (3: ) (4: ) +#19: (0:) (1: ) (2: ) (3: ) (4: ) + +#20: (0:) (1: ) (2: s1, s2) (3: s3) (4: ) + + diff --git a/testing_data/input/ds010/sample_sheet.tsv b/testing_data/input/ds010/sample_sheet.tsv new file mode 100755 index 0000000..7ec3fdc --- /dev/null +++ b/testing_data/input/ds010/sample_sheet.tsv @@ -0,0 +1,5 @@ +sample_id i7 i5 i7_rc i5_rc template +Sample01 ACGGGCAT GGCTAGAG 0 0 i78:i58 +Sample02 ACGCTAAT GGCTAGAG 0 0 i78:i58 +Sample03 ACGCCCAT GGCCAGAG 0 0 i78:i58 +Sample04 GGGTCGAT GGGGACTA 0 0 i78:i58 diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 534b702..8f10ba5 100755 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -11,7 +11,7 @@ use walkdir::WalkDir; fn get_hash(file_path: &String) -> Vec { println!("Getting hash for the file {}.", file_path); let mut f = File::open(file_path).unwrap(); - let mut buffer = Vec::new(); + let mut buffer:Vec = Vec::new(); f.read_to_end(&mut buffer).unwrap(); buffer } @@ -159,7 +159,7 @@ fn testing_template() { #[test] fn testing_demultiplex() { - for ds_itr_tmp in 1..10{ + for ds_itr_tmp in 1..11{ let mut disable_illumina_format = false; let ds_itr_in = match ds_itr_tmp{ 6 => 1, @@ -175,22 +175,24 @@ fn testing_demultiplex() { _ => ds_itr_tmp }; + let ds_itr_fc = match ds_itr_in{ + 10 => 1, + _ => ds_itr_in + }; + let input_folder_path = match ds_itr_tmp == 6 { false => String::new(), true => String::from(format!("testing_data/input/ds0{}/L01/", ds_itr_in)) }; - let read1_file_path : String = String::from(format!("testing_data/input/ds0{}/L01/FC0{}_L01_read_1.fq.gz", ds_itr_in, ds_itr_in)); - let read2_file_path : String = String::from(format!("testing_data/input/ds0{}/L01/FC0{}_L01_read_2.fq.gz", ds_itr_in, ds_itr_in)); + let read1_file_path : String = String::from(format!("testing_data/input/ds0{}/L01/FC0{}_L01_read_1.fq.gz", ds_itr_in, ds_itr_fc)); + let read2_file_path : String = String::from(format!("testing_data/input/ds0{}/L01/FC0{}_L01_read_2.fq.gz", ds_itr_in, ds_itr_fc)); let sample_sheet_file_path : String = String::from(format!("testing_data/expected/ds0{}/sample_sheet_expected.tsv", ds_itr_ex)); let lane = String::from("L01"); let mut instrument = String::from("instrument_1"); let mut run = String::from("20231212"); let mut comprehensive_scan = false; - let mut digest_new; - let mut digest_original; - if ds_itr_tmp == 5 || ds_itr_tmp == 2 { comprehensive_scan = true; } @@ -293,8 +295,9 @@ fn testing_demultiplex() { assert_eq!(crc_new, crc_original); }else{ - digest_new = md5::compute(get_hash(&format!("{}{}", ouput_dir, &path.as_ref().unwrap().file_name().to_str().unwrap()))); - digest_original = md5::compute(get_hash(&format!("{}", &path.unwrap().path().display()))); + + let digest_new = md5::compute(get_hash(&format!("{}{}", ouput_dir, &path.as_ref().unwrap().file_name().to_str().unwrap()))); + let digest_original = md5::compute(get_hash(&format!("{}", &path.unwrap().path().display()))); assert_eq!(format!("{:x}", digest_new), format!("{:x}", digest_original)); } @@ -307,7 +310,7 @@ fn testing_demultiplex() { assert_eq!(count_files_recursive(&ouput_dir), count_files_recursive(&original_path)); - if [7, 8, 9].contains(&ds_itr_tmp){ + if [7, 8, 9, 10].contains(&ds_itr_tmp){ break; }