diff --git a/clip-vg.cpp b/clip-vg.cpp index 9d461ce..62136f3 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -30,6 +30,7 @@ void help(char** argv) { << " -b, --bed FILE Intervals to clip in BED format" << endl << " -m, --min-length N Only clip paths of length < N" << endl << " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl + << " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl << " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl << " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl << " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl @@ -41,7 +42,7 @@ void help(char** argv) { static unordered_map>> load_bed(istream& bed_stream, const string& ref_prefix); static unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix); + const string& ref_prefix, bool softmasked_only); static unique_ptr load_graph(istream& graph_stream); static vector &split_delims(const string &s, const string& delims, vector &elems); static void chop_path_intervals(MutablePathMutableHandleGraph* graph, @@ -108,6 +109,7 @@ int main(int argc, char** argv) { string bed_path; int64_t min_length = 0; int64_t max_unaligned = 0; + bool unaligned_softmasked_only = false; string ref_prefix; size_t input_count = 0; bool force_clip = false; @@ -124,6 +126,7 @@ int main(int argc, char** argv) { {"bed", required_argument, 0, 'b'}, {"min-length", required_argument, 0, 'm'}, {"max-unaligned", required_argument, 0, 'u'}, + {"sofmasked-only", no_argument, 0, 's'}, {"ref-prefix", required_argument, 0, 'e'}, {"force-clip", no_argument, 0, 'f'}, {"name-replace", required_argument, 0, 'r'}, @@ -135,7 +138,7 @@ int main(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "hpb:m:u:e:fnr:d:", + c = getopt_long (argc, argv, "hpb:m:u:se:fnr:d:", long_options, &option_index); // Detect the end of the options. @@ -156,6 +159,9 @@ int main(int argc, char** argv) { max_unaligned = stol(optarg); ++input_count; break; + case 's': + unaligned_softmasked_only = true; + break; case 'e': ref_prefix = optarg; break; @@ -213,6 +219,11 @@ int main(int argc, char** argv) { return 1; } + if (unaligned_softmasked_only && max_unaligned <= 0) { + cerr << "[clip-vg] error: -s must be used with -u" << endl; + return 1; + } + string graph_path = argv[optind++]; ifstream graph_stream(graph_path); if (!graph_stream) { @@ -225,6 +236,11 @@ int main(int argc, char** argv) { cerr << "[clip-vg]: Loaded graph" << endl; } + if (unaligned_softmasked_only && !dynamic_cast(graph.get())) { + cerr << "[clip-vg]: -s option only works with HashGraph inputs (as it requires softmasked sequence)" << endl; + return 1; + } + unordered_map>> bed_intervals; if (!bed_path.empty()) { @@ -251,7 +267,7 @@ int main(int argc, char** argv) { }); } else if (max_unaligned != 0) { // apply max unaligned length to all paths - bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix); + bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only); } if (progress) { @@ -323,7 +339,7 @@ unordered_map>> load_bed(istream& bed_stre } unordered_map>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned, - const string& ref_prefix) { + const string& ref_prefix, bool softmasked_only) { unordered_map>> intervals; graph->for_each_path_handle([&](path_handle_t path_handle) { @@ -335,12 +351,19 @@ unordered_map>> find_unaligned(const PathH handle_t handle = graph->get_handle_of_step(step_handle); int64_t len = (int64_t)graph->get_length(handle); bool aligned = false; - graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { - if (graph->get_path_handle_of_step(step_handle_2) != path_handle) { - aligned = true; - } - return !aligned; - }); + if (softmasked_only) { + // if this flag's on, we discount any handle that has an upper case character + string sequence = graph->get_sequence(handle); + aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); }); + } + if (!aligned) { + graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + if (graph->get_path_handle_of_step(step_handle_2) != path_handle) { + aligned = true; + } + return !aligned; + }); + } // start an unaligned interval if (start < 0 && aligned == false) { start = offset;