diff --git a/clip-vg.cpp b/clip-vg.cpp index 62136f3..63c06ae 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -30,7 +30,7 @@ void help(char** argv) { << " -b, --bed FILE Intervals to clip in BED format" << endl << " -m, --min-length N Only clip paths of length < N" << endl << " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl - << " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl + << " -s, --masked-pct F Only clip out unaligned *softmasked* (lowercase) or hardmasked regions with -u, where F is the minimum fraction of softmasked bases to count [default: 0]" << endl << " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl << " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl << " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl @@ -42,7 +42,7 @@ void help(char** argv) { static unordered_map>> load_bed(istream& bed_stream, const string& ref_prefix); static unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix, bool softmasked_only); + const string& ref_prefix, double softmasked_frac); static unique_ptr load_graph(istream& graph_stream); static vector &split_delims(const string &s, const string& delims, vector &elems); static void chop_path_intervals(MutablePathMutableHandleGraph* graph, @@ -109,7 +109,7 @@ int main(int argc, char** argv) { string bed_path; int64_t min_length = 0; int64_t max_unaligned = 0; - bool unaligned_softmasked_only = false; + double unaligned_softmasked_frac = 0.; string ref_prefix; size_t input_count = 0; bool force_clip = false; @@ -126,7 +126,7 @@ int main(int argc, char** argv) { {"bed", required_argument, 0, 'b'}, {"min-length", required_argument, 0, 'm'}, {"max-unaligned", required_argument, 0, 'u'}, - {"sofmasked-only", no_argument, 0, 's'}, + {"masked-pct", required_argument, 0, 's'}, {"ref-prefix", required_argument, 0, 'e'}, {"force-clip", no_argument, 0, 'f'}, {"name-replace", required_argument, 0, 'r'}, @@ -138,7 +138,7 @@ int main(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "hpb:m:u:se:fnr:d:", + c = getopt_long (argc, argv, "hpb:m:u:s:e:fnr:d:", long_options, &option_index); // Detect the end of the options. @@ -160,8 +160,8 @@ int main(int argc, char** argv) { ++input_count; break; case 's': - unaligned_softmasked_only = true; - break; + unaligned_softmasked_frac = stof(optarg); + break; case 'e': ref_prefix = optarg; break; @@ -219,7 +219,7 @@ int main(int argc, char** argv) { return 1; } - if (unaligned_softmasked_only && max_unaligned <= 0) { + if (unaligned_softmasked_frac > 0 && max_unaligned <= 0) { cerr << "[clip-vg] error: -s must be used with -u" << endl; return 1; } @@ -236,7 +236,7 @@ int main(int argc, char** argv) { cerr << "[clip-vg]: Loaded graph" << endl; } - if (unaligned_softmasked_only && !dynamic_cast(graph.get())) { + if (unaligned_softmasked_frac && !dynamic_cast(graph.get())) { cerr << "[clip-vg]: -s option only works with HashGraph inputs (as it requires softmasked sequence)" << endl; return 1; } @@ -267,7 +267,7 @@ int main(int argc, char** argv) { }); } else if (max_unaligned != 0) { // apply max unaligned length to all paths - bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only); + bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_frac); } if (progress) { @@ -339,7 +339,7 @@ unordered_map>> load_bed(istream& bed_stre } unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix, bool softmasked_only) { + const string& ref_prefix, double softmasked_frac) { unordered_map>> intervals; graph->for_each_path_handle([&](path_handle_t path_handle) { @@ -347,45 +347,51 @@ unordered_map>> find_unaligned(const PathH if (ref_prefix.empty() || path_name.substr(0, ref_prefix.length()) != ref_prefix) { int64_t offset = 0; int64_t start = -1; + int64_t sm_count = 0; graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { handle_t handle = graph->get_handle_of_step(step_handle); int64_t len = (int64_t)graph->get_length(handle); bool aligned = false; - if (softmasked_only) { - // if this flag's on, we discount any handle that has an upper case character - string sequence = graph->get_sequence(handle); - aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); - } - if (!aligned) { - graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { - if (graph->get_path_handle_of_step(step_handle_2) != path_handle) { - aligned = true; - } - return !aligned; - }); - } + graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + if (graph->get_path_handle_of_step(step_handle_2) != path_handle) { + aligned = true; + } + return !aligned; + }); // start an unaligned interval if (start < 0 && aligned == false) { start = offset; + sm_count = 0; } + + // count the softmasked bases + if (!aligned && softmasked_frac > 0.) { + string sequence = graph->get_sequence(handle); + for (size_t i = 0; i < len; ++i) { + if (std::islower(sequence[i]) || sequence[i] == 'N') { + ++sm_count; + } + } + } + // end an unaligned interval if (aligned == true) { - if (start >= 0 && offset + len - start > max_unaligned) { + if (start >=0 && offset > max_unaligned && sm_count) + if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) { intervals[path_name].push_back(make_pair(start, offset)); } start = -1; } offset += len; }); - if (start >= 0 && offset - start > max_unaligned) { + if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) { intervals[path_name].push_back(make_pair(start, offset)); } } }); return intervals; } - - + unique_ptr load_graph(istream& graph_stream) { char magic_bytes[4];