Merge pull request #18 from DLBPointon/dp24_splitby

Dp24 splitby
Rust-Wellcome · May 22, 2024 · 68fa664 · 68fa664
2 parents c0555f2 + f0f6103
commit 68fa664
Show file tree

Hide file tree

Showing 6 changed files with 188 additions and 78 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,13 +1,14 @@
 [package]
 name = "fasta_manipulation"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
 clap = { version = "4.4.4", features = ["cargo"] }
 colored = "2.0.4"
+compare = "0.1.0"
 csv = "1.3.0"
 io = "0.0.2"
 noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }

diff --git a/README.md b/README.md
@@ -26,7 +26,9 @@ Currently, this program has the following arguments:
     This command generates a directory of files, each made up of a user-specified number of sequences from the input FASTA. This is useful when generating geneset data for TreeVal or when sub-setting data in a non-random manner.
     The count is an upper limit: a file holding the leftover records may contain fewer.
 
-    `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE}`
+    This will generate files in `{outdir}/{fasta-file.prefix}/{data_type}/{input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa`
+
+    `splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`
 
 -   split_by_size (NOT YET WRITTEN)
 
@@ -59,5 +61,10 @@ Currently, this program has the following arguments:
 
     -   GC percentage per scaffold + counts
     -   GC percentage whole genome
+    -   N50 and N90
+    -   L50
+    -   GAP count and length (summary with average length)
+
+    `profile -f input.fasta -o outdir`
 
 If there are other options that would be useful to other teams, please leave a message or open an issue.
diff --git a/src/generics.rs b/src/generics.rs
@@ -1,4 +1,5 @@
 use noodles::fasta;
+use noodles::fasta::record::Definition;
 use std::error::Error;
 use std::{collections::HashMap, fmt, io::BufRead, result, str};
 
@@ -41,3 +42,42 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
     // Take a HashMap and return an iterator over its keys
     map.into_iter().map(|(k, _v)| k)
 }
+
+fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
+    let header_list: Vec<&str> = header.split(' ').collect();
+    let record_header = header_list[0];
+    Ok(record_header[1..].to_owned())
+    // let re = Regex::new(r"gene=([A-Z]\w+)").unwrap();
+
+    // let first_run = re.captures(&header).ok_or("None")?;
+
+    // if first_run[0] == "None".to_owned() {
+    //     let re = Regex::new(r"symbol:(\S+)").unwrap();
+    //     let second_run = re.captures(&header).ok_or("None")?;
+    //     if second_run[0] == "None".to_owned() {
+    //         let re = Regex::new(r"(\(\S+\)) gene").unwrap();
+    //         let third_run = re.captures(&header).ok_or("None")?;
+    //         if third_run[0] == "None".to_owned() {
+    //             Ok("NOCAPTUREDRESULT".to_string())
+    //         } else {
+    //             Ok(third_run[0].to_string())
+    //         }
+    //     } else {
+    //         Ok(second_run[0].to_string())
+    //     }
+    // } else {
+    //     Ok(first_run[0].to_string())
+    // }
+}
+
+pub fn sanitise_header(old_header: &Definition) -> String {
+    let x = get_gene_symbol(old_header.to_string());
+
+    // get_gene_symbol currently only ever returns Ok, so the Err arm below is just a fallback.
+    match x {
+        Ok(c) => c,
+        Err(e) => {
+            format!("Regex isnt good enough to capture header id: {}", e)
+        }
+    }
+}