Skip to content

Commit

Permalink
Merge pull request #46 from ComparativeGenomicsToolkit/chop
Browse files Browse the repository at this point in the history
Add softmasked only option to clip-vg
  • Loading branch information
glennhickey authored Jul 14, 2021
2 parents 3b34003 + ff2edf8 commit 446238b
Showing 1 changed file with 33 additions and 10 deletions.
43 changes: 33 additions & 10 deletions clip-vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ void help(char** argv) {
<< " -b, --bed FILE Intervals to clip in BED format" << endl
<< " -m, --min-length N Only clip paths of length < N" << endl
<< " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl
<< " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl
<< " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl
<< " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl
<< " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl
Expand All @@ -41,7 +42,7 @@ void help(char** argv) {

static unordered_map<string, vector<pair<int64_t, int64_t>>> load_bed(istream& bed_stream, const string& ref_prefix);
static unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned,
const string& ref_prefix);
const string& ref_prefix, bool softmasked_only);
static unique_ptr<MutablePathMutableHandleGraph> load_graph(istream& graph_stream);
static vector<string> &split_delims(const string &s, const string& delims, vector<string> &elems);
static void chop_path_intervals(MutablePathMutableHandleGraph* graph,
Expand Down Expand Up @@ -108,6 +109,7 @@ int main(int argc, char** argv) {
string bed_path;
int64_t min_length = 0;
int64_t max_unaligned = 0;
bool unaligned_softmasked_only = false;
string ref_prefix;
size_t input_count = 0;
bool force_clip = false;
Expand All @@ -124,6 +126,7 @@ int main(int argc, char** argv) {
{"bed", required_argument, 0, 'b'},
{"min-length", required_argument, 0, 'm'},
{"max-unaligned", required_argument, 0, 'u'},
{"sofmasked-only", no_argument, 0, 's'},
{"ref-prefix", required_argument, 0, 'e'},
{"force-clip", no_argument, 0, 'f'},
{"name-replace", required_argument, 0, 'r'},
Expand All @@ -135,7 +138,7 @@ int main(int argc, char** argv) {

int option_index = 0;

c = getopt_long (argc, argv, "hpb:m:u:e:fnr:d:",
c = getopt_long (argc, argv, "hpb:m:u:se:fnr:d:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -156,6 +159,9 @@ int main(int argc, char** argv) {
max_unaligned = stol(optarg);
++input_count;
break;
case 's':
unaligned_softmasked_only = true;
break;
case 'e':
ref_prefix = optarg;
break;
Expand Down Expand Up @@ -213,6 +219,11 @@ int main(int argc, char** argv) {
return 1;
}

if (unaligned_softmasked_only && max_unaligned <= 0) {
cerr << "[clip-vg] error: -s must be used with -u" << endl;
return 1;
}

string graph_path = argv[optind++];
ifstream graph_stream(graph_path);
if (!graph_stream) {
Expand All @@ -225,6 +236,11 @@ int main(int argc, char** argv) {
cerr << "[clip-vg]: Loaded graph" << endl;
}

if (unaligned_softmasked_only && !dynamic_cast<bdsg::HashGraph*>(graph.get())) {
cerr << "[clip-vg]: -s option only works with HashGraph inputs (as it requires softmasked sequence)" << endl;
return 1;
}

unordered_map<string, vector<pair<int64_t, int64_t>>> bed_intervals;

if (!bed_path.empty()) {
Expand All @@ -251,7 +267,7 @@ int main(int argc, char** argv) {
});
} else if (max_unaligned != 0) {
// apply max unaligned length to all paths
bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix);
bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only);
}

if (progress) {
Expand Down Expand Up @@ -323,7 +339,7 @@ unordered_map<string, vector<pair<int64_t, int64_t>>> load_bed(istream& bed_stre
}

unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned,
const string& ref_prefix) {
const string& ref_prefix, bool softmasked_only) {
unordered_map<string, vector<pair<int64_t, int64_t>>> intervals;

graph->for_each_path_handle([&](path_handle_t path_handle) {
Expand All @@ -335,12 +351,19 @@ unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathH
handle_t handle = graph->get_handle_of_step(step_handle);
int64_t len = (int64_t)graph->get_length(handle);
bool aligned = false;
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
if (graph->get_path_handle_of_step(step_handle_2) != path_handle) {
aligned = true;
}
return !aligned;
});
if (softmasked_only) {
// if this flag's on, we discount any handle that has an upper case character
string sequence = graph->get_sequence(handle);
aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); });
}
if (!aligned) {
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
if (graph->get_path_handle_of_step(step_handle_2) != path_handle) {
aligned = true;
}
return !aligned;
});
}
// start an unaligned interval
if (start < 0 && aligned == false) {
start = offset;
Expand Down

0 comments on commit 446238b

Please sign in to comment.