Skip to content

Commit

Permalink
fix up softmask option
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed Jul 21, 2021
1 parent 90b795b commit 494b0be
Showing 1 changed file with 33 additions and 49 deletions.
82 changes: 33 additions & 49 deletions clip-vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ void help(char** argv) {
<< " -b, --bed FILE Intervals to clip in BED format" << endl
<< " -m, --min-length N Only clip paths of length < N" << endl
<< " -u, --max-unaligned N Clip out unaligned regions of length > N" << endl
<< " -s, --softmasked-only Only clip out unaligned *softmasked* (lowercase) regions with -u" << endl
<< " -S, --softmasked-any Like -s, but clip any unaligned region containing any softmasked bases" << endl
<< " -s, --masked-pct F Only clip out unaligned *softmasked* (lowercase) or hardmasked regions with -u, where F is the minimum fraction of softmasked bases to count [default: 0]" << endl
<< " -e, --ref-prefix STR Forwardize (but don't clip) paths whose name begins with STR" << endl
<< " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl
<< " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl
Expand All @@ -43,8 +42,7 @@ void help(char** argv) {

static unordered_map<string, vector<pair<int64_t, int64_t>>> load_bed(istream& bed_stream, const string& ref_prefix);
static unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned,
const string& ref_prefix, bool softmasked_only,
bool softmasked_any);
const string& ref_prefix, double softmasked_frac);
static unique_ptr<MutablePathMutableHandleGraph> load_graph(istream& graph_stream);
static vector<string> &split_delims(const string &s, const string& delims, vector<string> &elems);
static void chop_path_intervals(MutablePathMutableHandleGraph* graph,
Expand Down Expand Up @@ -111,8 +109,7 @@ int main(int argc, char** argv) {
string bed_path;
int64_t min_length = 0;
int64_t max_unaligned = 0;
bool unaligned_softmasked_only = false;
bool unaligned_softmasked_any = false;
double unaligned_softmasked_frac = 0.;
string ref_prefix;
size_t input_count = 0;
bool force_clip = false;
Expand All @@ -129,8 +126,7 @@ int main(int argc, char** argv) {
{"bed", required_argument, 0, 'b'},
{"min-length", required_argument, 0, 'm'},
{"max-unaligned", required_argument, 0, 'u'},
{"sofmasked-only", no_argument, 0, 's'},
{"sofmasked-any", no_argument, 0, 'S'},
{"masked-pct", required_argument, 0, 's'},
{"ref-prefix", required_argument, 0, 'e'},
{"force-clip", no_argument, 0, 'f'},
{"name-replace", required_argument, 0, 'r'},
Expand All @@ -142,7 +138,7 @@ int main(int argc, char** argv) {

int option_index = 0;

c = getopt_long (argc, argv, "hpb:m:u:sSe:fnr:d:",
c = getopt_long (argc, argv, "hpb:m:u:s:e:fnr:d:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -164,11 +160,8 @@ int main(int argc, char** argv) {
++input_count;
break;
case 's':
unaligned_softmasked_only = true;
unaligned_softmasked_frac = stof(optarg);
break;
case 'S':
unaligned_softmasked_any = true;
break;
case 'e':
ref_prefix = optarg;
break;
Expand Down Expand Up @@ -226,21 +219,11 @@ int main(int argc, char** argv) {
return 1;
}

if (unaligned_softmasked_only && max_unaligned <= 0) {
if (unaligned_softmasked_frac > 0 && max_unaligned <= 0) {
cerr << "[clip-vg] error: -s must be used with -u" << endl;
return 1;
}

if (unaligned_softmasked_any && max_unaligned <= 0) {
cerr << "[clip-vg] error: -S must be used with -u" << endl;
return 1;
}

if (unaligned_softmasked_any && unaligned_softmasked_only) {
cerr << "[clip-vg] error: -S and -s cannot be used together" << endl;
return 1;
}

string graph_path = argv[optind++];
ifstream graph_stream(graph_path);
if (!graph_stream) {
Expand All @@ -253,7 +236,7 @@ int main(int argc, char** argv) {
cerr << "[clip-vg]: Loaded graph" << endl;
}

if (unaligned_softmasked_only && !dynamic_cast<bdsg::HashGraph*>(graph.get())) {
if (unaligned_softmasked_frac && !dynamic_cast<bdsg::HashGraph*>(graph.get())) {
cerr << "[clip-vg]: -s option only works with HashGraph inputs (as it requires softmasked sequence)" << endl;
return 1;
}
Expand Down Expand Up @@ -284,7 +267,7 @@ int main(int argc, char** argv) {
});
} else if (max_unaligned != 0) {
// apply max unaligned length to all paths
bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_only, unaligned_softmasked_any);
bed_intervals = find_unaligned(graph.get(), max_unaligned, ref_prefix, unaligned_softmasked_frac);
}

if (progress) {
Expand Down Expand Up @@ -356,58 +339,59 @@ unordered_map<string, vector<pair<int64_t, int64_t>>> load_bed(istream& bed_stre
}

unordered_map<string, vector<pair<int64_t, int64_t>>> find_unaligned(const PathHandleGraph* graph, int64_t max_unaligned,
const string& ref_prefix, bool softmasked_only,
bool softmasked_any) {
const string& ref_prefix, double softmasked_frac) {
unordered_map<string, vector<pair<int64_t, int64_t>>> intervals;

graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
if (ref_prefix.empty() || path_name.substr(0, ref_prefix.length()) != ref_prefix) {
int64_t offset = 0;
int64_t start = -1;
int64_t sm_count = 0;
graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
handle_t handle = graph->get_handle_of_step(step_handle);
int64_t len = (int64_t)graph->get_length(handle);
bool aligned = false;
if (softmasked_only || softmasked_any) {
// if this flag's on, we discount any handle that has an upper case character
string sequence = graph->get_sequence(handle);
if (softmasked_only) {
aligned = std::any_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); });
} else {
aligned = std::all_of(sequence.begin(), sequence.end(), [](unsigned char c){ return std::isupper(c); });
}
}
if (!aligned) {
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
if (graph->get_path_handle_of_step(step_handle_2) != path_handle) {
aligned = true;
}
return !aligned;
});
}
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
if (graph->get_path_handle_of_step(step_handle_2) != path_handle) {
aligned = true;
}
return !aligned;
});
// start an unaligned interval
if (start < 0 && aligned == false) {
start = offset;
sm_count = 0;
}

// count the softmasked bases
if (!aligned && softmasked_frac > 0.) {
string sequence = graph->get_sequence(handle);
for (size_t i = 0; i < len; ++i) {
if (std::islower(sequence[i]) || sequence[i] == 'N') {
++sm_count;
}
}
}

// end an unaligned interval
if (aligned == true) {
if (start >= 0 && offset + len - start > max_unaligned) {
if (start >=0 && offset > max_unaligned && sm_count)
if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) {
intervals[path_name].push_back(make_pair(start, offset));
}
start = -1;
}
offset += len;
});
if (start >= 0 && offset - start > max_unaligned) {
if (start >= 0 && offset - start > max_unaligned && (double)sm_count / (double)(offset-start) >= softmasked_frac) {
intervals[path_name].push_back(make_pair(start, offset));
}
}
});
return intervals;
}



unique_ptr<MutablePathMutableHandleGraph> load_graph(istream& graph_stream) {

char magic_bytes[4];
Expand Down

0 comments on commit 494b0be

Please sign in to comment.