From 90b795b1056feceb5475adc31d9d9be0afc1c600 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 21 Jul 2021 09:53:58 -0400 Subject: [PATCH 1/2] add softmasked any option --- clip-vg.cpp | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 62136f3..7d24b21 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -31,6 +31,7 @@ void help(char** argv) { << " -m, --min-length N Only clip paths of length < N" << endl << " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl << " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl + << " -S, --softmasked-any Like -s, but clip any unaligned region containing any softmasked bases" << endl << " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl << " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl << " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl @@ -42,7 +43,8 @@ void help(char** argv) { static unordered_map>> load_bed(istream& bed_stream, const string& ref_prefix); static unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix, bool softmasked_only); + const string& ref_prefix, bool softmasked_only, + bool softmasked_any); static unique_ptr load_graph(istream& graph_stream); static vector &split_delims(const string &s, const string& delims, vector &elems); static void chop_path_intervals(MutablePathMutableHandleGraph* graph, @@ -110,6 +112,7 @@ int main(int argc, char** argv) { int64_t min_length = 0; int64_t max_unaligned = 0; bool unaligned_softmasked_only = false; + bool unaligned_softmasked_any = false; string ref_prefix; size_t input_count = 0; bool force_clip = false; @@ -127,6 +130,7 @@ int main(int argc, char** argv) { {"min-length", required_argument, 0, 'm'}, {"max-unaligned", required_argument, 0, 'u'}, {"sofmasked-only", no_argument, 0, 's'}, + {"sofmasked-any", no_argument, 0, 'S'}, {"ref-prefix", required_argument, 0, 'e'}, {"force-clip", no_argument, 0, 'f'}, {"name-replace", required_argument, 0, 'r'}, @@ -138,7 +142,7 @@ int main(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "hpb:m:u:se:fnr:d:", + c = getopt_long (argc, argv, "hpb:m:u:sSe:fnr:d:", long_options, &option_index); // Detect the end of the options. @@ -161,7 +165,10 @@ int main(int argc, char** argv) { break; case 's': unaligned_softmasked_only = true; - break; + break; + case 'S': + unaligned_softmasked_any = true; + break; case 'e': ref_prefix = optarg; break; @@ -224,6 +231,16 @@ int main(int argc, char** argv) { return 1; } + if (unaligned_softmasked_any && max_unaligned <= 0) { + cerr << "[clip-vg] error: -S must be used with -u" << endl; + return 1; + } + + if (unaligned_softmasked_any && unaligned_softmasked_only) { + cerr << "[clip-vg] error: -S and -s cannot be used together" << endl; + return 1; + } + string graph_path = argv[optind++]; ifstream graph_stream(graph_path); if (!graph_stream) { @@ -267,7 +284,7 @@ int main(int argc, char** argv) { }); } else if (max_unaligned != 0) { // apply max unaligned length to all paths - bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only); + bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only, unaligned_softmasked_any); } if (progress) { @@ -339,7 +356,8 @@ unordered_map>> load_bed(istream& bed_stre } unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix, bool softmasked_only) { + const string& ref_prefix, bool softmasked_only, + bool softmasked_any) { unordered_map>> intervals; graph->for_each_path_handle([&](path_handle_t path_handle) { @@ -351,10 +369,14 @@ unordered_map>> find_unaligned(const PathH handle_t handle = graph->get_handle_of_step(step_handle); int64_t len = (int64_t)graph->get_length(handle); bool aligned = false; - if (softmasked_only) { + if (softmasked_only || softmasked_any) { // if this flag's on, we discount any handle that has an upper case character string sequence = graph->get_sequence(handle); - aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); + if (softmasked_only) { + aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); + } else { + aligned = std::all_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); + } } if (!aligned) { graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { From 494b0beeef5bcd0981c2476efcdad6df7a77c062 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 21 Jul 2021 11:33:31 -0400 Subject: [PATCH 2/2] fix up softmask option --- clip-vg.cpp | 82 +++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 49 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 7d24b21..63c06ae 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -30,8 +30,7 @@ void help(char** argv) { << " -b, --bed FILE Intervals to clip in BED format" << endl << " -m, --min-length N Only clip paths of length < N" << endl << " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl - << " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl - << " -S, --softmasked-any Like -s, but clip any unaligned region containing any softmasked bases" << endl + << " -s, --masked-pct F Only clip out unaligned *softmasked* (lowercase) or hardmasked regions with -u, where F is the minimum fraction of softmasked bases to count [default: 0]" << endl << " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl << " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl << " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl @@ -43,8 +42,7 @@ void help(char** argv) { static unordered_map>> load_bed(istream& bed_stream, const string& ref_prefix); static unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix, bool softmasked_only, - bool softmasked_any); + const string& ref_prefix, double softmasked_frac); static unique_ptr load_graph(istream& graph_stream); static vector &split_delims(const string &s, const string& delims, vector &elems); static void chop_path_intervals(MutablePathMutableHandleGraph* graph, @@ -111,8 +109,7 @@ int main(int argc, char** argv) { string bed_path; int64_t min_length = 0; int64_t max_unaligned = 0; - bool unaligned_softmasked_only = false; - bool unaligned_softmasked_any = false; + double unaligned_softmasked_frac = 0.; string ref_prefix; size_t input_count = 0; bool force_clip = false; @@ -129,8 +126,7 @@ int main(int argc, char** argv) { {"bed", required_argument, 0, 'b'}, {"min-length", required_argument, 0, 'm'}, {"max-unaligned", required_argument, 0, 'u'}, - {"sofmasked-only", no_argument, 0, 's'}, - {"sofmasked-any", no_argument, 0, 'S'}, + {"masked-pct", required_argument, 0, 's'}, {"ref-prefix", required_argument, 0, 'e'}, {"force-clip", no_argument, 0, 'f'}, {"name-replace", required_argument, 0, 'r'}, @@ -142,7 +138,7 @@ int main(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "hpb:m:u:sSe:fnr:d:", + c = getopt_long (argc, argv, "hpb:m:u:s:e:fnr:d:", long_options, &option_index); // Detect the end of the options. @@ -164,11 +160,8 @@ int main(int argc, char** argv) { ++input_count; break; case 's': - unaligned_softmasked_only = true; + unaligned_softmasked_frac = stof(optarg); break; - case 'S': - unaligned_softmasked_any = true; - break; case 'e': ref_prefix = optarg; break; @@ -226,21 +219,11 @@ int main(int argc, char** argv) { return 1; } - if (unaligned_softmasked_only && max_unaligned <= 0) { + if (unaligned_softmasked_frac > 0 && max_unaligned <= 0) { cerr << "[clip-vg] error: -s must be used with -u" << endl; return 1; } - if (unaligned_softmasked_any && max_unaligned <= 0) { - cerr << "[clip-vg] error: -S must be used with -u" << endl; - return 1; - } - - if (unaligned_softmasked_any && unaligned_softmasked_only) { - cerr << "[clip-vg] error: -S and -s cannot be used together" << endl; - return 1; - } - string graph_path = argv[optind++]; ifstream graph_stream(graph_path); if (!graph_stream) { @@ -253,7 +236,7 @@ int main(int argc, char** argv) { cerr << "[clip-vg]: Loaded graph" << endl; } - if (unaligned_softmasked_only && !dynamic_cast(graph.get())) { + if (unaligned_softmasked_frac && !dynamic_cast(graph.get())) { cerr << "[clip-vg]: -s option only works with HashGraph inputs (as it requires softmasked sequence)" << endl; return 1; } @@ -284,7 +267,7 @@ int main(int argc, char** argv) { }); } else if (max_unaligned != 0) { // apply max unaligned length to all paths - bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only, unaligned_softmasked_any); + bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_frac); } if (progress) { @@ -356,8 +339,7 @@ unordered_map>> load_bed(istream& bed_stre } unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix, bool softmasked_only, - bool softmasked_any) { + const string& ref_prefix, double softmasked_frac) { unordered_map>> intervals; graph->for_each_path_handle([&](path_handle_t path_handle) { @@ -365,49 +347,51 @@ unordered_map>> find_unaligned(const PathH if (ref_prefix.empty() || path_name.substr(0, ref_prefix.length()) != ref_prefix) { int64_t offset = 0; int64_t start = -1; + int64_t sm_count = 0; graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { handle_t handle = graph->get_handle_of_step(step_handle); int64_t len = (int64_t)graph->get_length(handle); bool aligned = false; - if (softmasked_only || softmasked_any) { - // if this flag's on, we discount any handle that has an upper case character - string sequence = graph->get_sequence(handle); - if (softmasked_only) { - aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); - } else { - aligned = std::all_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); - } - } - if (!aligned) { - graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { - if (graph->get_path_handle_of_step(step_handle_2) != path_handle) { - aligned = true; - } - return !aligned; - }); - } + graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + if (graph->get_path_handle_of_step(step_handle_2) != path_handle) { + aligned = true; + } + return !aligned; + }); // start an unaligned interval if (start < 0 && aligned == false) { start = offset; + sm_count = 0; } + + // count the softmasked bases + if (!aligned && softmasked_frac > 0.) { + string sequence = graph->get_sequence(handle); + for (size_t i = 0; i < len; ++i) { + if (std::islower(sequence[i]) || sequence[i] == 'N') { + ++sm_count; + } + } + } + // end an unaligned interval if (aligned == true) { - if (start >= 0 && offset + len - start > max_unaligned) { + if (start >=0 && offset > max_unaligned && sm_count) + if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) { intervals[path_name].push_back(make_pair(start, offset)); } start = -1; } offset += len; }); - if (start >= 0 && offset - start > max_unaligned) { + if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) { intervals[path_name].push_back(make_pair(start, offset)); } } }); return intervals; } - - + unique_ptr load_graph(istream& graph_stream) { char magic_bytes[4];