-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgafkluge.hpp
326 lines (284 loc) · 11.8 KB
/
gafkluge.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#pragma once
/**
* \file gafkluge.hpp
*
* Barebones header-only standalone GAF parser reads GAF lines into and out of structs
* Named in honour of gfakluge
*/
#include <string>
#include <iostream>
#include <sstream>
#include <cstdint>
#include <vector>
#include <map>
#include <functional>
namespace gafkluge {
/**
* We allow pretty much any field to be set as "*" in the GAF, which gets mapped to a -1
* in the numeric fields (as there are no valid negative values)
*/
static const std::string missing_string = "*";
static const int64_t missing_int = -1;
inline std::string int_to_string(int64_t i) {
return i == missing_int ? missing_string : std::to_string(i);
}
inline int64_t string_to_int(const std::string& s) {
return s == missing_string ? missing_int : std::stol(s);
}
inline bool is_missing(const std::string& s) {
return s == missing_string;
}
inline bool is_missing(int64_t i) {
return i == missing_int;
}
/**
* One step of a GAF path
*/
struct GafStep {
std::string name; // Either a path name or segment/node name (see above)
bool is_reverse; // In reverse orientation ('<' in GAF)
bool is_stable; // True if it's a stable path name (as opposed to segment/node name)
bool is_interval; // True if it's an interval of a stable path (false if it's the whole path)
int64_t start; // 0-based start (inclusive). only defined if is_stable and is_interval are true
int64_t end; // 0-based end (inclusive). only defined if is_stable and is_interval are true
};
/**
* One line of GAF as described here: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md
*/
struct GafRecord {
std::string query_name; // Query sequence name
int64_t query_length; // Query sequence length
int64_t query_start; // 0-based, closed
int64_t query_end; // 0-based, open
int64_t path_length;
int64_t path_start; // Start position on the path (0-based)
int64_t path_end; // End position on the path (0-based)
int64_t matches; // Number of residue matches
int64_t block_length; // Alignment block length
int32_t mapq; // Mapping quality (0-255; 255 for missing)
char strand; // strand relative to the path + or -
std::vector<GafStep> path; // the path
// Map a tag name to its type and value
// ex: "de:f:0.2183" in the GAF would appear as opt_fields["de"] = ("f", "0.2183")
std::map<std::string, std::pair<std::string, std::string>> opt_fields;
// Init everything to missing
GafRecord() : query_length(missing_int), query_start(missing_int), query_end(missing_int),
strand(missing_string[0]), path_length(missing_int), path_start(missing_int),
path_end(missing_int), matches(missing_int), block_length(missing_int), mapq(255) {}
};
/**
* Parse a single GAF record
*/
inline void parse_gaf_record(const std::string& gaf_line, GafRecord& gaf_record) {
std::istringstream in(gaf_line);
std::string buffer;
int col = 1;
std::function<void(void)> scan_column = [&]() {
getline(in, buffer, '\t');
if (!in || buffer.empty()) {
throw std::runtime_error("Error parsing GAF column " + std::to_string(col));
}
++col;
};
scan_column();
gaf_record.query_name = std::move(buffer);
scan_column();
gaf_record.query_length = string_to_int(buffer);
scan_column();
gaf_record.query_start = string_to_int(buffer);
scan_column();
gaf_record.query_end = string_to_int(buffer);
scan_column();
if (buffer == "-" || buffer == missing_string || buffer == "+") {
gaf_record.strand = buffer[0];
} else {
throw std::runtime_error("Error parsing GAF strand: " + buffer);
}
gaf_record.path.clear();
scan_column();
if (buffer[0] == '<' || buffer[0] == '>') {
// our path is a list of oriented segments or intervales
size_t pos = 0;
size_t next;
do {
GafStep step;
pos = buffer.find_first_of("><", pos);
next = buffer.find_first_of("><", pos + 1);
std::string step_token = buffer.substr(pos, next != std::string::npos ? next - pos : std::string::npos);
size_t colon = step_token.find_first_of(':');
step.is_reverse = step_token[0] == '<';
if (colon == std::string::npos) {
// no colon, we interpret the step as a segID
step.name = step_token.substr(1);
step.is_stable = false;
step.is_interval = false;
} else {
// colon, we interpret the step as a stable path interval
step.name = step_token.substr(1, colon - 1);
step.is_stable = true;
step.is_interval = true;
size_t dash = step_token.find_first_of('-', colon);
if (dash == std::string::npos) {
throw std::runtime_error("Error parsing GAF range of " + step_token);
}
step.start = std::stol(step_token.substr(colon + 1, dash - colon));
step.end = std::stol(step_token.substr(dash + 1));
}
gaf_record.path.push_back(step);
pos = next;
} while (next != std::string::npos);
} else if (buffer != "*") {
// our path is a stable path name
gaf_record.path.resize(1);
gaf_record.path[0].name = buffer;
gaf_record.path[0].is_reverse = false;
gaf_record.path[0].is_stable = true;
gaf_record.path[0].is_interval = false;
}
scan_column();
gaf_record.path_length = string_to_int(buffer);
scan_column();
gaf_record.path_start = string_to_int(buffer);
scan_column();
gaf_record.path_end = string_to_int(buffer);
scan_column();
gaf_record.matches = string_to_int(buffer);
scan_column();
gaf_record.block_length = string_to_int(buffer);
scan_column();
if (buffer == missing_string) {
gaf_record.mapq = missing_int;
} else {
gaf_record.mapq = std::stoi(buffer);
if (gaf_record.mapq >= 255) {
gaf_record.mapq = missing_int;
}
}
gaf_record.opt_fields.clear();
do {
getline(in, buffer, '\t');
if (in && !buffer.empty()) {
size_t col1 = buffer.find_first_of(':');
size_t col2 = buffer.find_first_of(':', col1 + 1);
if (buffer.length() < 5 || col1 == std::string::npos || col2 == std::string::npos) {
throw std::runtime_error("Unable to parse optional tag " + buffer);
}
std::string tag = buffer.substr(0, col1);
std::string type = buffer.substr(col1 + 1, col2 - col1 - 1);
std::string val = buffer.substr(col2 + 1);
if (gaf_record.opt_fields.count(tag)) {
throw std::runtime_error("Duplicate optional field found: " + tag);
}
gaf_record.opt_fields[tag] = make_pair(type, val);
}
} while (bool(in));
}
/*
* Visit each CS cigar record as a string. CS cigars are described here:
* https://github.com/lh3/minimap2#the-cs-optional-tag
*/
inline void for_each_cs(const GafRecord& gaf_record, std::function<void(const std::string&)> fn) {
if (gaf_record.opt_fields.count("cs")) {
const std::string& cs_cigar = gaf_record.opt_fields.find("cs")->second.second;
size_t next;
for (size_t co = 0; co != std::string::npos; co = next) {
next = cs_cigar.find_first_of(":*-+", co + 1);
fn(cs_cigar.substr(co, next == std::string::npos ? std::string::npos : next - co));
}
}
}
/*
* Visit each vg cigar record as a string. cg cigars are described here:
* https://samtools.github.io/hts-specs/SAMv1.pdf
* |([0-9]+[MIDNSHPX=])+
*/
inline void for_each_cg(const GafRecord& gaf_record, std::function<void(const char&, const size_t&)> fn) {
if (gaf_record.opt_fields.count("cg")) {
const std::string& cg_cigar = gaf_record.opt_fields.find("cg")->second.second;
size_t next;
for (size_t co = 0; co != std::string::npos && co < cg_cigar.length(); co = next) {
next = cg_cigar.find_first_of("MIDNSHPX=", co);
assert(next != std::string::npos); // todo: better error
std::string cg_len = cg_cigar.substr(co, next - co);
size_t cg_len_i = std::stol(cg_len);
fn(cg_cigar[next], cg_len_i);
++next;
}
}
}
/*
* Generic cigar function that will visit cs cigars if present, but fall back on cg cigars otherwise
* Function takes in token {:*-+MIDNSHPX=}, length, query-string, target-string
* The latter two strings are only filled by cs records and will be left empty for cg
*/
inline void for_each_cigar(const GafRecord& gaf_record, std::function<void(const char&, const size_t&, const std::string&, const std::string&)> fn) {
if (gaf_record.opt_fields.count("cs")) {
for_each_cs(gaf_record, [&](const std::string& cs_str) {
if (cs_str[0] == ':') {
fn(cs_str[0], std::stol(cs_str.substr(1)), "", "");
} else if (cs_str[0] == '+') {
std::string qs = cs_str.substr(1);
fn(cs_str[0], qs.length(), qs, "");
} else if (cs_str[0] == '-') {
std::string ts = cs_str.substr(1);
fn(cs_str[0], ts.length(), "", ts);
} else if (cs_str[0] == '*') {
assert(cs_str.length() == 3);
fn(cs_str[0], 1, cs_str.substr(2,1), cs_str.substr(1,1));
} else {
assert(false);
}
});
} else {
for_each_cg(gaf_record, [&](const char& cg_tok, const size_t& cg_len) {
fn(cg_tok, cg_len, "", "");
});
}
}
/*
* Write a GAF Step to a stream
*/
inline std::ostream& operator<<(std::ostream& os, const gafkluge::GafStep& gaf_step) {
if (!gaf_step.is_stable || gaf_step.is_interval) {
os << (gaf_step.is_reverse ? "<" : ">");
}
os << gaf_step.name;
if (gaf_step.is_interval) {
os << ":" << gaf_step.start << "-" << gaf_step.end;
}
return os;
}
/**
* Write a GAF record to a stream
*/
inline std::ostream& operator<<(std::ostream& os, const gafkluge::GafRecord& gaf_record) {
os << (gaf_record.query_name.empty() ? gafkluge::missing_string : gaf_record.query_name) << "\t"
<< gafkluge::int_to_string(gaf_record.query_length) << "\t"
<< gafkluge::int_to_string(gaf_record.query_start) << "\t"
<< gafkluge::int_to_string(gaf_record.query_end) << "\t"
<< gaf_record.strand << "\t";
if (gaf_record.path.empty()) {
os << gafkluge::missing_string << "\t"
<< gafkluge::missing_string << "\t"
<< gafkluge::missing_string << "\t"
<< gafkluge::missing_string << "\t"
<< gafkluge::missing_string << "\t"
<< gafkluge::missing_string << "\t";
} else {
for (const gafkluge::GafStep& step : gaf_record.path) {
os << step;
}
os << "\t"
<< gafkluge::int_to_string(gaf_record.path_length) << "\t"
<< gafkluge::int_to_string(gaf_record.path_start) << "\t"
<< gafkluge::int_to_string(gaf_record.path_end) << "\t"
<< gafkluge::int_to_string(gaf_record.matches) << "\t"
<< gafkluge::int_to_string(gaf_record.block_length) << "\t";
}
os << (gaf_record.mapq == gafkluge::missing_int ? 255 : gaf_record.mapq);
for (const auto& it : gaf_record.opt_fields) {
os << "\t" << it.first << ":" << it.second.first << ":" << it.second.second;
}
return os;
}
} // namesapce gafkluge