Skip to content

Commit

Permalink
subpath renaming convention in vg chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed Oct 11, 2019
1 parent 7ee5daf commit ec263c1
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 20 deletions.
36 changes: 23 additions & 13 deletions src/algorithms/subgraph.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "subgraph.hpp"
#include "../path.hpp"

namespace vg {
namespace algorithms {
Expand Down Expand Up @@ -292,8 +293,10 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat
/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph
/// based on their order in the path position index provided by the source graph
/// will clear any path found in both graphs before writing the new steps into it
/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset
/// in the source graph (unless the subpath was not cut up at all)
void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph,
bool rename_discontinuous_path_chunks) {
bool subpath_naming) {
std::unordered_map<std::string, std::map<uint64_t, handle_t> > subpaths;
subgraph.for_each_handle([&](const handle_t& h) {
handlegraph::nid_t id = subgraph.get_id(h);
Expand All @@ -308,28 +311,33 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath
}
});

function<path_handle_t(const string&, bool, size_t&)> new_subpath =
[&subgraph](const string& path_name, bool is_circular, size_t& sub_i) {
while (true) {
string subpath_name = path_name + ".chunk" + std::to_string(sub_i++);
if (!subgraph.has_path(subpath_name)) {
return subgraph.create_path_handle(subpath_name, is_circular);
}
function<path_handle_t(const string&, bool, size_t)> new_subpath =
[&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) {
string subpath_name = Paths::make_subpath_name(path_name, subpath_offset);
if (subgraph.has_path(subpath_name)) {
subgraph.destroy_path(subgraph.get_path_handle(subpath_name));
}
return subgraph.create_path_handle(subpath_name, is_circular);
};

for (auto& subpath : subpaths) {
const std::string& path_name = subpath.first;
path_handle_t source_path_handle = source.get_path_handle(path_name);
// destroy the path if it exists
if (subgraph.has_path(path_name)) {
subgraph.destroy_path(subgraph.get_path_handle(path_name));
}
size_t chunk_num = 0;
// fill in the path information
path_handle_t path = subgraph.create_path_handle(path_name);
// create a new path. give it a subpath name if the flag's on and its smaller than original
path_handle_t path;
if (!subpath_naming || subpath.second.size() == source.get_step_count(source_path_handle) ||
subpath.second.empty()) {
path = subgraph.create_path_handle(path_name, source.get_is_circular(source_path_handle));
} else {
path = new_subpath(path_name, source.get_is_circular(source_path_handle), subpath.second.begin()->first);
}
for (auto p = subpath.second.begin(); p != subpath.second.end(); ++p) {
const handle_t& handle = p->second;
if (p != subpath.second.begin()) {
if (p != subpath.second.begin() && subpath_naming) {
auto prev = p;
--prev;
const handle_t& prev_handle = prev->second;
Expand All @@ -350,9 +358,11 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath
}
if (delta != cont_delta) {
// we have a discontinuity! we'll make a new path can continue from there
path = new_subpath(path_name, subgraph.get_is_circular(path), chunk_num);
assert(subgraph.get_step_count(path) > 0);
path = new_subpath(path_name, subgraph.get_is_circular(path), p->first);
}
}
//fill in the path information
subgraph.append_step(path, handle);
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/algorithms/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat
/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph
/// based on their order in the path position index provided by the source graph
/// will clear any path found in both graphs before writing the new steps into it
/// if rename_discontinuous_path_chunks is true, after each detected path discontinuity in the
/// subgraph it will start a new path with a .chunki suffix
/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset
/// in the source graph (unless the subpath was not cut up at all)
void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph,
bool rename_discontinuous_path_chunks = false);
bool subpath_naming = false);

/// We can accumulate a subgraph without accumulating all the edges between its nodes
/// this helper ensures that we get the full set
Expand Down
22 changes: 21 additions & 1 deletion src/chunker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,27 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length
algorithms::add_connecting_edges_to_subgraph(*graph, subgraph);
}
algorithms::add_subpaths_to_subgraph(*graph, subgraph, true);


// merge back our reference path to use the old chopping code
// todo: work with subpaths somehow?
if (!subgraph.has_path(region.seq)) {
map<size_t, path_handle_t> ref_subpaths;
subgraph.for_each_path_handle([&](path_handle_t path_handle) {
string path_name = subgraph.get_path_name(path_handle);
auto res = Paths::parse_subpath_name(path_name);
if (get<0>(res) == true && get<1>(res) == region.seq) {
ref_subpaths[get<2>(res)] = path_handle;
}
});
path_handle_t new_ref_path = subgraph.create_path_handle(region.seq, graph->get_is_circular(path_handle));
for (auto& ref_subpath : ref_subpaths) {
subgraph.for_each_step_in_path(ref_subpath.second, [&] (step_handle_t subpath_step) {
subgraph.append_step(new_ref_path, subgraph.get_handle_of_step(subpath_step));
});
subgraph.destroy_path(ref_subpath.second);
}
}

// build the vg of the subgraph
subgraph.remove_orphan_edges();

Expand Down
19 changes: 19 additions & 0 deletions src/path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,25 @@ const std::function<bool(const string&)> Paths::is_alt = [](const string& path_n

};

// Check if using subpath naming scheme. If it is return true,
// the root path name, and the offset (false otherwise)
tuple<bool, string, size_t> Paths::parse_subpath_name(const string& path_name) {
size_t tag_offset = path_name.rfind(".subpath");
if (tag_offset == string::npos || tag_offset + 7 == path_name.length()) {
return make_tuple(false, "", 0);
} else {
string offset_str = path_name.substr(tag_offset + 8);
size_t offset_val = std::stol(offset_str);
return make_tuple(true, path_name.substr(0, tag_offset), offset_val);
}
}

// Create a subpath name
string Paths::make_subpath_name(const string& path_name, size_t offset) {
return path_name + ".subpath" + std::to_string(offset);
}


mapping_t::mapping_t(void) : traversal(0), length(0), rank(1) { }

mapping_t::mapping_t(const Mapping& m) {
Expand Down
7 changes: 7 additions & 0 deletions src/path.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ class Paths {
// We used to use a regex but that's a very slow way to check a prefix.
const static function<bool(const string&)> is_alt;

// Check if using subpath naming scheme. If it is return true,
// the root path name, and the offset (false otherwise)
tuple<bool, string, size_t> static parse_subpath_name(const string& path_name);

// Create a subpath name
string static make_subpath_name(const string& path_name, size_t offset);

Paths(void);

// copy
Expand Down
2 changes: 1 addition & 1 deletion test/t/26_deconstruct.t
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ printf "x\t13\tGGAAATTTTCTGGAGTTCTATTATATT\tGGAAATTTTCTGGAGTTCTATTATATAAATTTTCTG
diff cyclic_tiny_decon.tsv cyclic_tiny_truth.tsv
is "$?" 0 "deconstruct correctly handles a cycle in the alt path"

vg chunk -x cyclic_tiny.xg -r 10:15 -c 1 > cycle.vg
vg find -x cyclic_tiny.xg -n 10 -n 11 -n 12 -n 13 -n 14 -n 15 -c 1 > cycle.vg
vg index cycle.vg -x cycle.xg
vg deconstruct cycle.xg -p y -e -t 1 > cycle_decon.vcf
is $(grep -v "#" cycle_decon.vcf | wc -l) 2 "cyclic reference deconstruction has correct number of variants"
Expand Down
4 changes: 2 additions & 2 deletions test/t/30_vg_chunk.t
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ is $(vg chunk -x x.xg -r 1 -c 0 | vg view - -j | jq .node | grep id | wc -l) 1 "

# Check that traces work on a GBWT
is $(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | wc -l) 5 "id chunker traces correct chunk size"
is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x")' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT"
is "$(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x")' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it"
is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x.subpath0")' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT"
is "$(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x.subpath0")' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it"
is "$(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBWT"

#check that n-chunking works
Expand Down

1 comment on commit ec263c1

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch glenn. View the full report here.

19 tests passed, 0 tests failed and 0 tests skipped in 13263 seconds

Tests produced 361 warnings. 361 were for lower-than-expected alignment scores

Please sign in to comment.