-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpafcoverage.cpp
105 lines (90 loc) · 3.6 KB
/
pafcoverage.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include <functional>
#include <cassert>
#include <iostream>
#include "pafcoverage.hpp"
using namespace std;
void update_coverage_map(const string& paf_line, CoverageMap& coverage_map) {
// split into array of tokens
vector<string> toks;
split_delims(paf_line, "\t\n", toks);
if (toks.size() < 12) {
throw runtime_error("too few tokens in PAF line: " + paf_line);
}
string& query_name = toks[0];
int64_t query_length = stol(toks[1]);
vector<bool>& query_coverage = coverage_map[query_name];
if (query_coverage.empty()) {
query_coverage.resize(query_length, false);
}
assert(query_coverage.size() == query_length);
for (int i = 12; i < toks.size(); ++i) {
if (toks[i].substr(0, 5) == "cg:Z:") {
// note: query coordinates always on forward strand in paf
int64_t query_pos = stol(toks[2]);
for_each_cg(toks[i], [&](const string& val, const string& cat) {
int64_t len = stol(val);
if (cat == "M" || cat == "=" || cat == "X") {
for (int64_t j = 0; j < len; ++j) {
query_coverage[query_pos + j] = true;
}
}
if (cat == "M" || cat == "=" || cat == "X" || cat == "I") {
query_pos += len;
}
});
}
}
}
void print_coverage_summary(const CoverageMap& coverage_map, ostream& out) {
out << "query-name" << "\t" << "pct-coverage" << "\t" << "max-gap" << "\t" << "avg-gap" << endl;
out << "----------" << "\t" << "------------" << "\t" << "-------" << "\t" << "-------" << endl;
for (const auto& kv : coverage_map) {
const string& query_name = kv.first;
const vector<bool>& coverage = kv.second;
int64_t last_covered = -1;
int64_t coverage_count = 0;
vector<int64_t> gaps;
for (int64_t i = 0; i < coverage.size(); ++i) {
if (coverage[i] == true) {
++coverage_count;
if (i - last_covered > 1) {
gaps.push_back(i - last_covered - 1);
}
last_covered = i;
}
}
int64_t i = coverage.size();
if (i - last_covered > 1) {
gaps.push_back(i - last_covered - 1);
}
int64_t max_gap = 0;
int64_t total_gap = 0;
for (int i = 0; i < gaps.size(); ++i) {
max_gap = max(gaps[i], max_gap);
total_gap += gaps[i];
}
out << query_name << "\t"
<< ((float)coverage_count / coverage.size()) << "\t"
<< max_gap << "\t"
<< (!gaps.empty() ? (total_gap / gaps.size()) : 0) << endl;
}
}
void print_coverage_gaps_as_bed(const CoverageMap& coverage_map, ostream& out, int64_t min_gap_length) {
for (const auto& kv : coverage_map) {
const string& query_name = kv.first;
const vector<bool>& coverage = kv.second;
int64_t last_covered = -1;
for (int64_t i = 0; i < coverage.size(); ++i) {
if (coverage[i] == true) {
if (i - last_covered > min_gap_length) {
out << query_name << "\t" << (last_covered + 1) << "\t" << i << "\t" << "pafcoverage-m" << min_gap_length << endl;
}
last_covered = i;
}
}
int64_t i = coverage.size();
if (i - last_covered > min_gap_length) {
out << query_name << "\t" << (last_covered + 1) << "\t" << i << "\t" << "pafcoverage-m" << min_gap_length << endl;
}
}
}