Skip to content

Commit

Permalink
Merge pull request #47 from ComparativeGenomicsToolkit/chop
Browse files Browse the repository at this point in the history
Fix up clip-vg softmask filter
  • Loading branch information
glennhickey authored Jul 21, 2021
2 parents 446238b + 494b0be commit a046174
Showing 1 changed file with 34 additions and 28 deletions.
62 changes: 34 additions & 28 deletions clip-vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void help(char** argv) {
<< " -b, --bed FILE Intervals to clip in BED format" << endl
<< " -m, --min-length N Only clip paths of length < N" << endl
<< " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl
<< " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl
<< " -s, --masked-pct F Only clip out unaligned *softmasked* (lowercase) or hardmasked regions with -u, where F is the minimum fraction of softmasked bases to count [default: 0]" << endl
<< " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl
<< " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl
<< " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl
Expand All @@ -42,7 +42,7 @@ void help(char** argv) {

static unordered_map<string, vector<pair<int64_t, int64_t>>> load_bed(istream& bed_stream, const string& ref_prefix);
static unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned,
const string& ref_prefix, bool softmasked_only);
const string& ref_prefix, double softmasked_frac);
static unique_ptr<MutablePathMutableHandleGraph> load_graph(istream& graph_stream);
static vector<string> &split_delims(const string &s, const string& delims, vector<string> &elems);
static void chop_path_intervals(MutablePathMutableHandleGraph* graph,
Expand Down Expand Up @@ -109,7 +109,7 @@ int main(int argc, char** argv) {
string bed_path;
int64_t min_length = 0;
int64_t max_unaligned = 0;
bool unaligned_softmasked_only = false;
double unaligned_softmasked_frac = 0.;
string ref_prefix;
size_t input_count = 0;
bool force_clip = false;
Expand All @@ -126,7 +126,7 @@ int main(int argc, char** argv) {
{"bed", required_argument, 0, 'b'},
{"min-length", required_argument, 0, 'm'},
{"max-unaligned", required_argument, 0, 'u'},
{"sofmasked-only", no_argument, 0, 's'},
{"masked-pct", required_argument, 0, 's'},
{"ref-prefix", required_argument, 0, 'e'},
{"force-clip", no_argument, 0, 'f'},
{"name-replace", required_argument, 0, 'r'},
Expand All @@ -138,7 +138,7 @@ int main(int argc, char** argv) {

int option_index = 0;

c = getopt_long (argc, argv, "hpb:m:u:se:fnr:d:",
c = getopt_long (argc, argv, "hpb:m:u:s:e:fnr:d:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -160,8 +160,8 @@ int main(int argc, char** argv) {
++input_count;
break;
case 's':
unaligned_softmasked_only = true;
break;
unaligned_softmasked_frac = stof(optarg);
break;
case 'e':
ref_prefix = optarg;
break;
Expand Down Expand Up @@ -219,7 +219,7 @@ int main(int argc, char** argv) {
return 1;
}

if (unaligned_softmasked_only && max_unaligned <= 0) {
if (unaligned_softmasked_frac > 0 && max_unaligned <= 0) {
cerr << "[clip-vg] error: -s must be used with -u" << endl;
return 1;
}
Expand All @@ -236,7 +236,7 @@ int main(int argc, char** argv) {
cerr << "[clip-vg]: Loaded graph" << endl;
}

if (unaligned_softmasked_only && !dynamic_cast<bdsg::HashGraph*>(graph.get())) {
if (unaligned_softmasked_frac && !dynamic_cast<bdsg::HashGraph*>(graph.get())) {
cerr << "[clip-vg]: -s option only works with HashGraph inputs (as it requires softmasked sequence)" << endl;
return 1;
}
Expand Down Expand Up @@ -267,7 +267,7 @@ int main(int argc, char** argv) {
});
} else if (max_unaligned != 0) {
// apply max unaligned length to all paths
bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only);
bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_frac);
}

if (progress) {
Expand Down Expand Up @@ -339,53 +339,59 @@ unordered_map<string, vector<pair<int64_t, int64_t>>> load_bed(istream& bed_stre
}

unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned,
const string& ref_prefix, bool softmasked_only) {
const string& ref_prefix, double softmasked_frac) {
unordered_map<string, vector<pair<int64_t, int64_t>>> intervals;

graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
if (ref_prefix.empty() || path_name.substr(0, ref_prefix.length()) != ref_prefix) {
int64_t offset = 0;
int64_t start = -1;
int64_t sm_count = 0;
graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
handle_t handle = graph->get_handle_of_step(step_handle);
int64_t len = (int64_t)graph->get_length(handle);
bool aligned = false;
if (softmasked_only) {
// if this flag's on, we discount any handle that has an upper case character
string sequence = graph->get_sequence(handle);
aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); });
}
if (!aligned) {
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
if (graph->get_path_handle_of_step(step_handle_2) != path_handle) {
aligned = true;
}
return !aligned;
});
}
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
if (graph->get_path_handle_of_step(step_handle_2) != path_handle) {
aligned = true;
}
return !aligned;
});
// start an unaligned interval
if (start < 0 && aligned == false) {
start = offset;
sm_count = 0;
}

// count the softmasked bases
if (!aligned && softmasked_frac > 0.) {
string sequence = graph->get_sequence(handle);
for (size_t i = 0; i < len; ++i) {
if (std::islower(sequence[i]) || sequence[i] == 'N') {
++sm_count;
}
}
}

// end an unaligned interval
if (aligned == true) {
if (start >= 0 && offset + len - start > max_unaligned) {
if (start >=0 && offset > max_unaligned && sm_count)
if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) {
intervals[path_name].push_back(make_pair(start, offset));
}
start = -1;
}
offset += len;
});
if (start >= 0 && offset - start > max_unaligned) {
if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) {
intervals[path_name].push_back(make_pair(start, offset));
}
}
});
return intervals;
}



unique_ptr<MutablePathMutableHandleGraph> load_graph(istream& graph_stream) {

char magic_bytes[4];
Expand Down

0 comments on commit a046174

Please sign in to comment.