From 561f3b7fb0d70bdc51e7491617904c81a028ecd7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 27 May 2022 10:42:37 -0400 Subject: [PATCH 1/2] clean up straggling cerrs in clip-vg --- clip-vg.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clip-vg.cpp b/clip-vg.cpp index b5a3a5a..b4d94d1 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -320,9 +320,11 @@ int main(int argc, char** argv) { if (!out_bed_path.empty()) { unordered_map>> output_graph_intervals = get_path_intervals(graph.get()); +#ifdef debug for (const auto& xx : output_graph_intervals) { cerr << " got output intervals " << xx.first << " count = " << xx.second.size() << endl; } +#endif unordered_map>> clipped_graph_intervals = get_clipped_intervals(input_graph_intervals, output_graph_intervals); ofstream out_bed_file(out_bed_path); size_t icount = 0; @@ -1061,9 +1063,13 @@ static unordered_map>> get_clipped_interva if (!output_intervals.count(path_name)) { // path doesn't appear in output -> everything was clipped clipped_intervals[path_name].insert(clipped_intervals[path_name].end(), in_intervals.begin(), in_intervals.end()); +#ifdef debug cerr << "clippin everything for " << path_name << endl; +#endif } else { +#ifdef debug cerr << "doin frag clip for " << path_name << endl; +#endif const auto& out_intervals = output_intervals.at(path_name); // note: clipping here is fairly simple because output intervals are a subset // and nothing overlaps From 174436c482481a122fc786ed69e58fe85148f273 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 3 Nov 2022 14:21:47 -0400 Subject: [PATCH 2/2] add n stats --- count-vg-hap-cov.cpp | 54 +++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/count-vg-hap-cov.cpp b/count-vg-hap-cov.cpp index bcef027..5375350 100644 --- a/count-vg-hap-cov.cpp +++ b/count-vg-hap-cov.cpp @@ -164,8 +164,10 @@ int main(int argc, char** argv) { // depth stats (one per thread) vector> depth_base_counts(get_thread_count()); + vector> depth_nfree_base_counts(get_thread_count()); vector> depth_node_counts(get_thread_count()); vector> depth_base_counts_nonref(get_thread_count()); + vector> depth_nfree_base_counts_nonref(get_thread_count()); vector> depth_node_counts_nonref(get_thread_count()); // do counts for each graph arg @@ -209,19 +211,31 @@ int main(int argc, char** argv) { if (depth_base_counts[t].size() <= coverage) { depth_base_counts[t].resize(coverage + 1, 0); depth_node_counts[t].resize(coverage + 1, 0); + depth_nfree_base_counts[t].resize(coverage + 1, 0); } - depth_base_counts[t][coverage] += graph->get_length(handle); + int64_t node_len = graph->get_length(handle); + int64_t num_ns = 0; + string node_seq = graph->get_sequence(handle); + for (auto c : node_seq) { + if (c == 'N' || c == 'n') { + ++num_ns; + } + } + depth_base_counts[t][coverage] += node_len; + depth_nfree_base_counts[t][coverage] += node_len - num_ns; depth_node_counts[t][coverage] += 1; - + if (!ref && !ref_sample.empty()) { // update the nonref coverage int64_t coverage = sample_set.size(); if (depth_base_counts_nonref[t].size() <= coverage) { depth_base_counts_nonref[t].resize(coverage + 1, 0); depth_node_counts_nonref[t].resize(coverage + 1, 0); + depth_nfree_base_counts_nonref[t].resize(coverage + 1, 0); } - depth_base_counts_nonref[t][coverage] += graph->get_length(handle); - depth_node_counts_nonref[t][coverage] += 1; + depth_base_counts_nonref[t][coverage] += node_len; + depth_nfree_base_counts_nonref[t][coverage] += node_len - num_ns; + depth_node_counts_nonref[t][coverage] += 1; } }, true); @@ -236,18 +250,22 @@ int main(int argc, char** argv) { for (int64_t coverage = 0; coverage < depth_base_counts[t].size(); ++coverage) { if (depth_base_counts[0].size() <= coverage) { depth_base_counts[0].resize(coverage + 1, 0); + depth_nfree_base_counts[0].resize(coverage + 1, 0); depth_node_counts[0].resize(coverage + 1, 0); } depth_base_counts[0][coverage] += depth_base_counts[t][coverage]; + depth_nfree_base_counts[0][coverage] += depth_nfree_base_counts[t][coverage]; depth_node_counts[0][coverage] += depth_node_counts[t][coverage]; if (!ref_sample.empty()) { if (depth_base_counts_nonref[0].size() <= coverage) { depth_base_counts_nonref[0].resize(coverage + 1, 0); + depth_nfree_base_counts_nonref[0].resize(coverage + 1, 0); depth_node_counts_nonref[0].resize(coverage + 1, 0); } if (coverage < depth_base_counts_nonref[t].size()) { depth_base_counts_nonref[0][coverage] += depth_base_counts_nonref[t][coverage]; + depth_nfree_base_counts_nonref[0][coverage] += depth_nfree_base_counts_nonref[t][coverage]; depth_node_counts_nonref[0][coverage] += depth_node_counts_nonref[t][coverage]; } } @@ -278,36 +296,40 @@ int main(int argc, char** argv) { // cumulate from 0 vector node_counts_cumul = get_cumul(depth_node_counts[0]); vector base_counts_cumul = get_cumul(depth_base_counts[0]); + vector nfree_base_counts_cumul = get_cumul(depth_nfree_base_counts[0]); vector node_counts_nonref_cumul = get_cumul(depth_node_counts_nonref[0]); vector base_counts_nonref_cumul = get_cumul(depth_base_counts_nonref[0]); + vector nfree_base_counts_nonref_cumul = get_cumul(depth_nfree_base_counts_nonref[0]); //cumulate from end vector node_counts_lumuc = get_lumuc(depth_node_counts[0]); vector base_counts_lumuc = get_lumuc(depth_base_counts[0]); + vector nfree_base_counts_lumuc = get_lumuc(depth_nfree_base_counts[0]); vector node_counts_nonref_lumuc = get_lumuc(depth_node_counts_nonref[0]); vector base_counts_nonref_lumuc = get_lumuc(depth_base_counts_nonref[0]); + vector nfree_base_counts_nonref_lumuc = get_lumuc(depth_nfree_base_counts_nonref[0]); // print the results cout << "hap-depth" - << "\t" << "nodes" << "\t" << "bases" - << "\t" << "nodes-cumul" << "\t" <<"bases-cumul" - << "\t" << "nodes-cumul-rev" << "\t" << "bases-cumul-rev"; + << "\t" << "nodes" << "\t" << "bases" << "\t" << "non-n-bases" + << "\t" << "nodes-cumul" << "\t" <<"bases-cumul" << "\t" << "non-n-bases-cumul" + << "\t" << "nodes-cumul-rev" << "\t" << "bases-cumul-rev" << "\t" << "non-n-bases-cumul-rev"; if (!ref_sample.empty()) { - cout << "\t" << "nodes-nonref" << "\t" << "bases-nonref" - << "\t" << "nodes-cumul-nonref" << "\t" << "bases-cumul-nonref" - << "\t" << "nodes-cumul-rev-nonref" << "\t" << "bases-cumul-rev-nonref"; + cout << "\t" << "nodes-nonref" << "\t" << "bases-nonref" << "\t" << "non-n-bases-nonref" + << "\t" << "nodes-cumul-nonref" << "\t" << "bases-cumul-nonref" << "\t" << "non-n-bases-cumul-nonref" + << "\t" << "nodes-cumul-rev-nonref" << "\t" << "bases-cumul-rev-nonref" << "\t" << "non-n-bases-cumul-rev-nonref"; } cout << endl; for (int64_t coverage = 0; coverage < depth_base_counts[0].size(); ++coverage) { cout << coverage - << "\t" << depth_node_counts[0][coverage] << "\t" << depth_base_counts[0][coverage] - << "\t" << node_counts_cumul[coverage] << "\t" << base_counts_cumul[coverage] - << "\t" << node_counts_lumuc[coverage] << "\t" << base_counts_lumuc[coverage]; + << "\t" << depth_node_counts[0][coverage] << "\t" << depth_base_counts[0][coverage] << "\t" << depth_nfree_base_counts[0][coverage] + << "\t" << node_counts_cumul[coverage] << "\t" << base_counts_cumul[coverage] << "\t" << nfree_base_counts_cumul[coverage] + << "\t" << node_counts_lumuc[coverage] << "\t" << base_counts_lumuc[coverage] << "\t" << nfree_base_counts_lumuc[coverage]; if (!ref_sample.empty()) { - cout << "\t" << depth_node_counts_nonref[0][coverage] << "\t" << depth_base_counts_nonref[0][coverage] - << "\t" << node_counts_nonref_cumul[coverage] << "\t" << base_counts_nonref_cumul[coverage] - << "\t" << node_counts_nonref_lumuc[coverage] << "\t" << base_counts_nonref_lumuc[coverage]; + cout << "\t" << depth_node_counts_nonref[0][coverage] << "\t" << depth_base_counts_nonref[0][coverage] << "\t" << depth_nfree_base_counts_nonref[0][coverage] + << "\t" << node_counts_nonref_cumul[coverage] << "\t" << base_counts_nonref_cumul[coverage] << "\t" << nfree_base_counts_nonref_cumul[coverage] + << "\t" << node_counts_nonref_lumuc[coverage] << "\t" << base_counts_nonref_lumuc[coverage] << "\t" << nfree_base_counts_nonref_lumuc[coverage]; } cout << "\n"; }