Skip to content

Commit

Permalink
Merge pull request #18 from DLBPointon/dp24_splitby
Browse files Browse the repository at this point in the history
Dp24 splitby
  • Loading branch information
DLBPointon authored May 22, 2024
2 parents c0555f2 + f0f6103 commit 68fa664
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 78 deletions.
9 changes: 8 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
[package]
name = "fasta_manipulation"
version = "0.1.2"
version = "0.1.3"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.4.4", features = ["cargo"] }
colored = "2.0.4"
compare = "0.1.0"
csv = "1.3.0"
io = "0.0.2"
noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }
Expand Down
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ Currently, this program has the following arguments:
This command will generate a directory of files, each made up of a user-specified number of sequences from the input fasta. This is useful when generating geneset data for TreeVal use or sub-setting data in a non-random manner.
The count is an upper limit per file: the final file holds whatever records are left over, which may be fewer than the count.

`splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE}`
This will generate files in `{outdir}/{fasta-file.prefix}/{data_type}/{input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa`

`splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`

- split_by_size (NOT YET WRITTEN)

Expand Down Expand Up @@ -59,5 +61,10 @@ Currently, this program has the following arguments:

- GC percentage per scaffold + counts
- GC percentage whole genome
- N50 and N90
- L50
- GAP count and length (summary with average length)

`profile -f input.fasta -o outdir`

If there are other options that would be useful to any other teams, leave a message or open an issue.
40 changes: 40 additions & 0 deletions src/generics.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use noodles::fasta;
use noodles::fasta::record::Definition;
use std::error::Error;
use std::{collections::HashMap, fmt, io::BufRead, result, str};

Expand Down Expand Up @@ -41,3 +42,42 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
// Consume the HashMap and yield only its keys, discarding the values.
// The result is a lazy iterator (not a Vec); callers collect it if they
// need an owned collection.
map.into_iter().map(|(k, _v)| k)
}

/// Extracts the record identifier from a FASTA header line.
///
/// The header is split on single spaces, the first token is taken, and its
/// leading character is dropped (presumably the `>` marker produced when a
/// definition is rendered to text -- confirm against the caller).
///
/// A header consisting of only one character yields an empty identifier.
///
/// # Errors
///
/// Returns an error when the first token is empty, i.e. the header is empty
/// or begins with a space, because there is no leading character to drop.
fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
    // `split` always yields at least one item, so the fallback is defensive only.
    let first_token = header.split(' ').next().unwrap_or("");

    // Step over exactly one character rather than one byte, so a multi-byte
    // first character cannot cause a slice panic on a non-boundary index.
    let mut remaining = first_token.chars();
    if remaining.next().is_none() {
        return Err("FASTA header has no identifier before the first space".into());
    }

    Ok(remaining.as_str().to_owned())
}

/// Produces a simplified header string from a FASTA record definition.
///
/// The definition is rendered to text and passed to `get_gene_symbol`. On
/// success the extracted identifier is returned. On failure the returned
/// string is a diagnostic message embedding the error, not an identifier,
/// so callers always receive a `String`.
pub fn sanitise_header(old_header: &Definition) -> String {
    get_gene_symbol(old_header.to_string()).unwrap_or_else(|err| {
        format!("Regex isnt good enough to capture header id: {}", err)
    })
}
Loading

0 comments on commit 68fa664

Please sign in to comment.