From 02124fabdbc29c601c48b4850f0f19623f08c7af Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Jan 2021 13:13:14 -0500 Subject: [PATCH 01/23] add node chopper tool --- Makefile | 12 +- chop-vg-paths.cpp | 329 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 chop-vg-paths.cpp diff --git a/Makefile b/Makefile index 82f486a..53319f7 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ rootPath = ./ include ./include.mk -all : hal2vg +all : hal2vg chop-vg-paths # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed # from source and configured with "--enable-static --disable-shared", then have its @@ -20,10 +20,10 @@ else endif cleanFast : - rm -f hal2vg hal2vg.o + rm -f hal2vg hal2vg.o chop-vg-paths chop-vg-paths.o clean : - rm -f hal2vg hal2vg.o + rm -f hal2vg hal2vg.o chop-vg-paths chop-vg-paths.o cd deps/sonLib && make clean cd deps/pinchesAndCacti && make clean cd deps/hal && make clean @@ -47,5 +47,11 @@ ${libbdsgPath}/lib/libbdsg.a : hal2vg : hal2vg.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread hal2vg.o ${basicLibs} -o hal2vg +chop-vg-paths.o : chop-vg-paths.cpp ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -I . chop-vg-paths.cpp -c + +chop-vg-paths : chop-vg-paths.o ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -fopenmp -pthread chop-vg-paths.o ${basicLibs} -o chop-vg-paths + test : hal2vg cd tests && prove -v small.t diff --git a/chop-vg-paths.cpp b/chop-vg-paths.cpp new file mode 100644 index 0000000..a48f34a --- /dev/null +++ b/chop-vg-paths.cpp @@ -0,0 +1,329 @@ +// Chop regions (from BED File) out of paths in vg graphs, creating subpath names and cutting out nodes or parts of nodes +// Assumes that: +// - regions don't overlap (error otherwise) +// - regions are only ever part of at most one path each (assert false otherwise) + +//#define debug + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include "bdsg/odgi.hpp" + +using namespace std; +using namespace handlegraph; +using namespace bdsg; + +void help(char** argv) { + cerr << "usage: " << argv[0] << " [options] " << endl + << "Chop out path intervals from a vg graph" << endl + << endl; +} + +static unordered_map>> load_bed(istream& bed_stream); +static unique_ptr load_graph(istream& graph_stream); +static vector &split_delims(const string &s, const string& delims, vector &elems); +static void chop_path_intervals(MutablePathMutableHandleGraph* graph, + const unordered_map>>& bed_intervals); +static vector chop_path(MutablePathMutableHandleGraph* graph, + path_handle_t path_handle, + const vector>& intervals); +// Create a subpath name (todo: make same function in vg consistent (it only includes start)) +static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) { + return path_name + "[" + std::to_string(offset) + ":" + std::to_string(offset + length) + "]"; +} + +int main(int argc, char** argv) { + + int c; + optind = 1; + while (true) { + + static const struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "h", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help(argv); + exit(1); + break; + default: + abort (); + } + } + + if (argc <= 1) { + help(argv); + return 1; + } + + // Parse the positional argument + if (optind >= argc) { + cerr << "[chop-vg-paths] error: too few arguments" << endl; + help(argv); + return 1; + } + + if (optind != argc - 2) { + cerr << "[chop-vg-paths] error: too many arguments" << endl; + help(argv); + return 1; + } + + string graph_path = argv[optind++]; + string bed_path = argv[optind++]; + + ifstream bed_stream(bed_path); + if (!bed_stream) { + cerr << "[chop-vg-paths] error: Unable to open input BED file " << bed_path << endl; + return 1; + } + unordered_map>> bed_intervals = load_bed(bed_stream); + bed_stream.close(); + + ifstream graph_stream(graph_path); + if (!graph_stream) { + cerr << "[chop-vg-paths] error: Unable to open input graph " << graph_path << endl; + return 1; + } + unique_ptr graph = load_graph(graph_stream); + graph_stream.close(); + + chop_path_intervals(graph.get(), bed_intervals); + + dynamic_cast(graph.get())->serialize(cout); + + return 0; +} + +unordered_map>> load_bed(istream& bed_stream) { + // load bed + unordered_map>> intervals; + string buffer; + while (getline(bed_stream, buffer)) { + vector toks; + split_delims(buffer, "\t\n", toks); + string& name = toks[0]; + int64_t start = stol(toks[1]); + int64_t end = stol(toks[2]); + if (toks.size() >= 3) { + intervals[name].push_back(make_pair(start, end)); + } + } + // verify bed + for (auto& seq_intervals : intervals) { + sort(seq_intervals.second.begin(), seq_intervals.second.end(), + [](const pair& b1, const pair& b2) { + return b1.first < b2.first || (b1.first == b2.first && b1.second < b2.second); + }); + for (size_t i = 1; i < seq_intervals.second.size(); ++i) { + if (seq_intervals.second[i].first < seq_intervals.second[i-1].second) { + cerr << "Overlapping bed intervals found:\n" + << " " << seq_intervals.first << "\t" + << seq_intervals.second[i-1].first << "\t" + << seq_intervals.second[i-1].second << endl + << " " << seq_intervals.first << "\t" + << seq_intervals.second[i].first << "\t" + << seq_intervals.second[i].second << endl + << "These are not supported. Please clean up (ex with bedools merge) first" << endl; + exit(1); + } + } + } + return intervals; +} + +unique_ptr load_graph(istream& graph_stream) { + + char magic_bytes[4]; + graph_stream.read(magic_bytes, 4); + uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); + graph_stream.clear(); + graph_stream.seekg(0, ios::beg); + + MutablePathMutableHandleGraph* graph; + if (magic_number == PackedGraph().get_magic_number()) { + graph = new PackedGraph(); + } else if (magic_number == HashGraph().get_magic_number()) { + graph = new HashGraph(); + } else if (magic_number == ODGI().get_magic_number()) { + graph = new ODGI(); + } else { + cerr << "Unable to parse input graph with magic number " << magic_number << endl; + exit(1); + } + dynamic_cast(graph)->deserialize(graph_stream); + + return unique_ptr(graph); +} + +vector &split_delims(const string &s, const string& delims, vector &elems) { + size_t start = string::npos; + for (size_t i = 0; i < s.size(); ++i) { + if (delims.find(s[i]) != string::npos) { + if (start != string::npos && i > start) { + elems.push_back(s.substr(start, i - start)); + } + start = string::npos; + } else if (start == string::npos) { + start = i; + } + } + if (start != string::npos && start < s.size()) { + elems.push_back(s.substr(start, s.size() - start)); + } + return elems; +} + +void chop_path_intervals(MutablePathMutableHandleGraph* graph, + const unordered_map>>& bed_intervals) { + // careful not to iterate and chop, as we could hit new subpaths made + vector path_handles; + graph->for_each_path_handle([&](path_handle_t path_handle) { + path_handles.push_back(path_handle); + }); + + for (auto path_handle : path_handles) { + string path_name = graph->get_path_name(path_handle); + auto it = bed_intervals.find(path_name); + if (it != bed_intervals.end()) { + vector chopped_handles = chop_path(graph, path_handle, it->second); + if (!chopped_handles.empty()) { + graph->destroy_path(path_handle); + for (handle_t handle : chopped_handles) { + assert(graph->steps_of_handle(handle).empty()); + dynamic_cast(graph)->destroy_handle(handle); + } + } + } + } +} + +vector chop_path(MutablePathMutableHandleGraph* graph, + path_handle_t path_handle, + const vector>& intervals) { + + // get the breakpoints + set breakpoints; + for (const pair& interval : intervals) { + breakpoints.insert(interval.first); + breakpoints.insert(interval.second); // we're cutting before offset, so the open coordinate is what we want + } + + // to be safe, don't cut and iterate at the same time, so load up steps here + vector steps; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + steps.push_back(graph->get_handle_of_step(step_handle)); + }); + + // cut the nodes to ensure breakpoints at node boundaries + int64_t offset = 0; + for (auto handle : steps) { + int64_t len = graph->get_length(handle); + // find breakpoints in node + vector cut_points; + for (auto i = breakpoints.lower_bound(offset); i != breakpoints.end() && *i - offset < len; ++i) { + cut_points.push_back(*i - offset); + } + // chop the node +#ifdef debug + if (!cut_points.empty()) { + cerr << "dividing node_id=" << graph->get_id(handle) << " for path " << graph->get_path_name(path_handle) << " at cut points:"; + for (auto cp : cut_points) { + cerr << " " << cp; + } + cerr << endl; + } +#endif + graph->divide_handle(handle, cut_points); + offset += len; + } + + steps.clear(); + int64_t original_path_length = offset; + auto fbi = breakpoints.upper_bound(0); + vector chopped_handles; + if (fbi == breakpoints.end() || *fbi >= original_path_length) { + // nothing to chop + return chopped_handles; + } + offset = 0; + step_handle_t current_step = graph->path_begin(path_handle); + vector subpaths; + + // cut out a subpath and make a new path out of it + function cut_to = [&](int64_t end_offset) { +#ifdef debug + cerr << "\ncut_to " << end_offset << " where current offset is " << offset << endl; +#endif + vector steps; + int64_t start_offset = offset; + int64_t path_length = 0; + while (offset < end_offset && current_step != graph->path_end(path_handle)) { + handle_t handle = graph->get_handle_of_step(current_step); + steps.push_back(handle); +#ifdef debug + cerr << " pushing subpath step " << graph->get_id(handle) << " len=" << graph->get_length(handle) << endl; +#endif + offset += graph->get_length(handle); + current_step = graph->get_next_step(current_step); + path_length += graph->get_length(handle); + } +#ifdef debug + cerr << "start offset=" << start_offset << " path length=" << path_length << " end offset=" << end_offset << endl; +#endif + assert(start_offset + path_length == end_offset); + + if (path_length > 0) { + path_handle_t subpath_handle = graph->create_path_handle(make_subpath_name(graph->get_path_name(path_handle), start_offset, path_length)); + for (auto step : steps) { + graph->append_step(subpath_handle, step); + } + subpaths.push_back(subpath_handle); + } + }; + + for (size_t i = 0; i < intervals.size(); ++i) { + if (intervals[i].first > offset) { + // cut everythign left of the interval + cut_to(intervals[i].first); + } + // scan past the interval + while (offset < intervals[i].second && current_step != graph->path_end(path_handle)) { + handle_t handle = graph->get_handle_of_step(current_step); + offset += graph->get_length(handle); + current_step = graph->get_next_step(current_step); + chopped_handles.push_back(handle); + } + } + + // cut the last bit + if (offset < original_path_length - 1) { + cut_to(original_path_length); + } + + assert (!subpaths.empty()); + return chopped_handles; +} From b8ef6771e027fd33e47f11030bc191783aca90d2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Jan 2021 17:09:18 -0500 Subject: [PATCH 02/23] add chop tests --- Makefile | 2 +- chop-vg-paths.cpp | 37 ++++++++++++++++-------- tests/t/chop.t | 74 +++++++++++++++++++++++++++++++++++++++++++++++ tests/t/small.t | 22 ++++++++++++++ 4 files changed, 122 insertions(+), 13 deletions(-) create mode 100644 tests/t/chop.t create mode 100644 tests/t/small.t diff --git a/Makefile b/Makefile index 53319f7..df604be 100644 --- a/Makefile +++ b/Makefile @@ -54,4 +54,4 @@ chop-vg-paths : chop-vg-paths.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread chop-vg-paths.o ${basicLibs} -o chop-vg-paths test : hal2vg - cd tests && prove -v small.t + cd tests && prove -v t diff --git a/chop-vg-paths.cpp b/chop-vg-paths.cpp index a48f34a..3f59853 100644 --- a/chop-vg-paths.cpp +++ b/chop-vg-paths.cpp @@ -125,10 +125,10 @@ unordered_map>> load_bed(istream& bed_stre while (getline(bed_stream, buffer)) { vector toks; split_delims(buffer, "\t\n", toks); - string& name = toks[0]; - int64_t start = stol(toks[1]); - int64_t end = stol(toks[2]); if (toks.size() >= 3) { + string& name = toks[0]; + int64_t start = stol(toks[1]); + int64_t end = stol(toks[2]); intervals[name].push_back(make_pair(start, end)); } } @@ -245,32 +245,43 @@ vector chop_path(MutablePathMutableHandleGraph* graph, // find breakpoints in node vector cut_points; for (auto i = breakpoints.lower_bound(offset); i != breakpoints.end() && *i - offset < len; ++i) { - cut_points.push_back(*i - offset); + int64_t cut_point = *i - offset; + // libbdsg is buggy and can't accept cutpoints on ends on reverse strand + if (cut_point > 0 && cut_point < len) { + cut_points.push_back(cut_point); + } } // chop the node #ifdef debug if (!cut_points.empty()) { - cerr << "dividing node_id=" << graph->get_id(handle) << " for path " << graph->get_path_name(path_handle) << " at cut points:"; + cerr << "dividing node_id=" << graph->get_id(handle) << ":" << graph->get_is_reverse(handle) << " seq=" << graph->get_sequence(handle) + << " for path " << graph->get_path_name(path_handle) << " at cut points:"; for (auto cp : cut_points) { cerr << " " << cp; } cerr << endl; } #endif +#ifdef debug + vector pieces = graph->divide_handle(handle, cut_points) ; + for (auto piece : pieces) { + cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) << endl; + } +#else graph->divide_handle(handle, cut_points); +#endif offset += len; } steps.clear(); int64_t original_path_length = offset; - auto fbi = breakpoints.upper_bound(0); vector chopped_handles; - if (fbi == breakpoints.end() || *fbi >= original_path_length) { - // nothing to chop - return chopped_handles; - } offset = 0; step_handle_t current_step = graph->path_begin(path_handle); +#ifdef debug + cerr << "init step to " << graph->get_id(graph->get_handle_of_step(current_step)) << ":" << graph->get_is_reverse(graph->get_handle_of_step(current_step)) + << " seq=" <get_sequence(graph->get_handle_of_step(current_step)) << endl; +#endif vector subpaths; // cut out a subpath and make a new path out of it @@ -315,15 +326,17 @@ vector chop_path(MutablePathMutableHandleGraph* graph, handle_t handle = graph->get_handle_of_step(current_step); offset += graph->get_length(handle); current_step = graph->get_next_step(current_step); +#ifdef debug + cerr << "deleting " << graph->get_id(handle) << endl; +#endif chopped_handles.push_back(handle); } } // cut the last bit - if (offset < original_path_length - 1) { + if (offset < original_path_length) { cut_to(original_path_length); } - assert (!subpaths.empty()); return chopped_handles; } diff --git a/tests/t/chop.t b/tests/t/chop.t new file mode 100644 index 0000000..7439e90 --- /dev/null +++ b/tests/t/chop.t @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +BASH_TAP_ROOT=./bash-tap +. ${BASH_TAP_ROOT}/bash-tap-bootstrap + +PATH=../bin:$PATH +PATH=../deps/hal:$PATH + +plan tests 16 + +vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg +printf "x\t0\t100\n" > all.bed +chop-vg-paths tiny-flat.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa +is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" + +rm -f all.bed chopped-all.gfa + +printf "x\t0\t1\n" > ends.bed +printf "x\t48\t50\n" >> ends.bed +chop-vg-paths tiny-flat.vg ends.bed > chopped-ends.vg +is "$(vg paths -Ev chopped-ends.vg)" "x[1:48] 47" "chopping ends gives subpath in the middle with correct length" +is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" + +rm -f ends.bed chopped-ends.vg + +printf "x\t20\t25\n" > bits.bed +printf "x\t1\t5\n" >> bits.bed +printf "x\t10\t20\n" >> bits.bed +printf "x\t40\t49\n" >> bits.bed +chop-vg-paths tiny-flat.vg bits.bed > chopped-bits.vg +vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths +is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" +is "$(grep 'x\[0:1\].1' bits.paths | wc -l)" "1" "first bit found" +is "$(grep 'x\[5:10\].5' bits.paths | wc -l)" "1" "next bit found" +is "$(grep 'x\[25:40\].15' bits.paths | wc -l)" "1" "next bit after found" +is "$(grep 'x\[49:50\].1' bits.paths | wc -l)" "1" "last bit found" + +rm -f bits.bed chopped-bits.vg bits.paths + +rm -f tiny-flat.vg + +########## flip path and repeat ########## + +#vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg +vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg +printf "x\t0\t100\n" > all.bed +chop-vg-paths tiny-rev.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa +is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" + +rm -f all.bed chopped-all.gfa + +printf "x\t0\t1\n" > ends.bed +printf "x\t48\t50\n" >> ends.bed +chop-vg-paths tiny-rev.vg ends.bed > chopped-ends.vg +is "$(vg paths -Ev chopped-ends.vg)" "x[1:48] 47" "chopping ends gives subpath in the middle with correct length" +is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" + +rm -f ends.bed chopped-ends.vg + +printf "x\t20\t25\n" > bits.bed +printf "x\t1\t5\n" >> bits.bed +printf "x\t10\t20\n" >> bits.bed +printf "x\t40\t49\n" >> bits.bed +chop-vg-paths tiny-rev.vg bits.bed > chopped-bits.vg +vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths +is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" +is "$(grep 'x\[0:1\].1' bits.paths | wc -l)" "1" "first bit found" +is "$(grep 'x\[5:10\].5' bits.paths | wc -l)" "1" "next bit found" +is "$(grep 'x\[25:40\].15' bits.paths | wc -l)" "1" "next bit after found" +is "$(grep 'x\[49:50\].1' bits.paths | wc -l)" "1" "last bit found" + +rm -f bits.bed chopped-bits.vg bits.paths + +rm -f tiny-rev.vg diff --git a/tests/t/small.t b/tests/t/small.t new file mode 100644 index 0000000..bcd1c21 --- /dev/null +++ b/tests/t/small.t @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +BASH_TAP_ROOT=./bash-tap +. ${BASH_TAP_ROOT}/bash-tap-bootstrap + +PATH=../bin:$PATH +PATH=../deps/hal:$PATH + +plan tests 2 + +maf2hal small/small.maf small.hal +hal2vg small.hal > small.vg +vg view -j small.vg | jq . > small.json + +is $(vg validate small.vg | wc -l) 0 "output vg validates" + +# jq craziness from https://stackoverflow.com/questions/31930041/using-jq-or-alternative-command-line-tools-to-compare-json-files +is $(jq --argfile a small.json --argfile b small/truth.json -n 'def post_recurse(f): def r: (f | select(. != null) | r), .; r; def post_recurse: post_recurse(.[]?); ($a | (post_recurse | arrays) |= sort) as $a | ($b | (post_recurse | arrays) |= sort) as $b | $a == $b') true "output graph identical to manually verified truth graph" + +rm -f small.vg small.json + +rm -f small.hal From 7fd7baef30a203f4279efd31d12c57ed71fe13e0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Jan 2021 17:09:37 -0500 Subject: [PATCH 03/23] update libbdsg --- deps/libbdsg-easy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libbdsg-easy b/deps/libbdsg-easy index 3e618e9..1a14abb 160000 --- a/deps/libbdsg-easy +++ b/deps/libbdsg-easy @@ -1 +1 @@ -Subproject commit 3e618e9d8c97f8ca61f9d99964100618f05d7418 +Subproject commit 1a14abb2db4e8a4ce02be4e6d3e7aba5e8d4c1ac From b345db448c30f8a18b07d223616388b94999168d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Jan 2021 21:28:59 -0500 Subject: [PATCH 04/23] build before test --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index df604be..7a2a4ac 100644 --- a/Makefile +++ b/Makefile @@ -53,5 +53,6 @@ chop-vg-paths.o : chop-vg-paths.cpp ${basicLibsDependencies} chop-vg-paths : chop-vg-paths.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread chop-vg-paths.o ${basicLibs} -o chop-vg-paths -test : hal2vg +test : + make cd tests && prove -v t From 0883b9daf6a8bdf1053d4a38417b392f15bca26a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Jan 2021 21:39:36 -0500 Subject: [PATCH 05/23] update vg version used in travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index bdaa04c..242b8bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ before_install: install: - sudo pip3 install setuptools --upgrade - - wget https://github.com/vgteam/vg/releases/download/v1.24.0/vg && chmod u+x vg + - wget https://github.com/vgteam/vg/releases/download/v1.30.0/vg && chmod u+x vg script: - export PATH=$(pwd):$PATH From 1a1bf37987c487a54b51404fe142d1f5bdbcec8e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 07:28:30 -0500 Subject: [PATCH 06/23] commit test data --- tests/chop/tiny-flat.gfa | 5 +++++ tests/chop/tiny-rev.gfa | 5 +++++ tests/small.t | 22 ---------------------- 3 files changed, 10 insertions(+), 22 deletions(-) create mode 100644 tests/chop/tiny-flat.gfa create mode 100644 tests/chop/tiny-rev.gfa delete mode 100644 tests/small.t diff --git a/tests/chop/tiny-flat.gfa b/tests/chop/tiny-flat.gfa new file mode 100644 index 0000000..c24148c --- /dev/null +++ b/tests/chop/tiny-flat.gfa @@ -0,0 +1,5 @@ +H VN:Z:1.0 +S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA +S 2 TTATATTCCAACTCTCTG +P x 1+,2+ * +L 1 + 2 + * diff --git a/tests/chop/tiny-rev.gfa b/tests/chop/tiny-rev.gfa new file mode 100644 index 0000000..6f0cd07 --- /dev/null +++ b/tests/chop/tiny-rev.gfa @@ -0,0 +1,5 @@ +H VN:Z:1.0 +S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA +S 2 TTATATTCCAACTCTCTG +P x 2-,1- * +L 1 + 2 + * diff --git a/tests/small.t b/tests/small.t deleted file mode 100644 index bcd1c21..0000000 --- a/tests/small.t +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -BASH_TAP_ROOT=./bash-tap -. ${BASH_TAP_ROOT}/bash-tap-bootstrap - -PATH=../bin:$PATH -PATH=../deps/hal:$PATH - -plan tests 2 - -maf2hal small/small.maf small.hal -hal2vg small.hal > small.vg -vg view -j small.vg | jq . > small.json - -is $(vg validate small.vg | wc -l) 0 "output vg validates" - -# jq craziness from https://stackoverflow.com/questions/31930041/using-jq-or-alternative-command-line-tools-to-compare-json-files -is $(jq --argfile a small.json --argfile b small/truth.json -n 'def post_recurse(f): def r: (f | select(. != null) | r), .; r; def post_recurse: post_recurse(.[]?); ($a | (post_recurse | arrays) |= sort) as $a | ($b | (post_recurse | arrays) |= sort) as $b | $a == $b') true "output graph identical to manually verified truth graph" - -rm -f small.vg small.json - -rm -f small.hal From 8016eb4f3100d13c31853a9824fb6c993c96204d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 08:39:00 -0500 Subject: [PATCH 07/23] rebrand as clip-vg --- Makefile | 14 +++++++------- chop-vg-paths.cpp => clip-vg.cpp | 5 +++++ tests/t/chop.t | 15 ++++++++------- 3 files changed, 20 insertions(+), 14 deletions(-) rename chop-vg-paths.cpp => clip-vg.cpp (98%) diff --git a/Makefile b/Makefile index 7a2a4ac..17614c3 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ rootPath = ./ include ./include.mk -all : hal2vg chop-vg-paths +all : hal2vg clip-vg # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed # from source and configured with "--enable-static --disable-shared", then have its @@ -20,10 +20,10 @@ else endif cleanFast : - rm -f hal2vg hal2vg.o chop-vg-paths chop-vg-paths.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o clean : - rm -f hal2vg hal2vg.o chop-vg-paths chop-vg-paths.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o cd deps/sonLib && make clean cd deps/pinchesAndCacti && make clean cd deps/hal && make clean @@ -47,11 +47,11 @@ ${libbdsgPath}/lib/libbdsg.a : hal2vg : hal2vg.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread hal2vg.o ${basicLibs} -o hal2vg -chop-vg-paths.o : chop-vg-paths.cpp ${basicLibsDependencies} - ${cpp} ${CXXFLAGS} -I . chop-vg-paths.cpp -c +clip-vg.o : clip-vg.cpp ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -I . clip-vg.cpp -c -chop-vg-paths : chop-vg-paths.o ${basicLibsDependencies} - ${cpp} ${CXXFLAGS} -fopenmp -pthread chop-vg-paths.o ${basicLibs} -o chop-vg-paths +clip-vg : clip-vg.o ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -fopenmp -pthread clip-vg.o ${basicLibs} -o clip-vg test : make diff --git a/chop-vg-paths.cpp b/clip-vg.cpp similarity index 98% rename from chop-vg-paths.cpp rename to clip-vg.cpp index 3f59853..f624605 100644 --- a/chop-vg-paths.cpp +++ b/clip-vg.cpp @@ -163,6 +163,11 @@ unique_ptr load_graph(istream& graph_stream) { graph_stream.clear(); graph_stream.seekg(0, ios::beg); + if (magic_number != ODGI().get_magic_number()) { + cerr << "Only ODGI supported until this bug is fixed: https://github.com/vgteam/libbdsg/issues/94" << endl; + exit(1); + } + MutablePathMutableHandleGraph* graph; if (magic_number == PackedGraph().get_magic_number()) { graph = new PackedGraph(); diff --git a/tests/t/chop.t b/tests/t/chop.t index 7439e90..91aae08 100644 --- a/tests/t/chop.t +++ b/tests/t/chop.t @@ -8,16 +8,17 @@ PATH=../deps/hal:$PATH plan tests 16 -vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg +#vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg +vg convert -g chop/tiny-flat.gfa -o > tiny-flat.vg printf "x\t0\t100\n" > all.bed -chop-vg-paths tiny-flat.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa +clip-vg tiny-flat.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" rm -f all.bed chopped-all.gfa printf "x\t0\t1\n" > ends.bed printf "x\t48\t50\n" >> ends.bed -chop-vg-paths tiny-flat.vg ends.bed > chopped-ends.vg +clip-vg tiny-flat.vg ends.bed > chopped-ends.vg is "$(vg paths -Ev chopped-ends.vg)" "x[1:48] 47" "chopping ends gives subpath in the middle with correct length" is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" @@ -27,7 +28,7 @@ printf "x\t20\t25\n" > bits.bed printf "x\t1\t5\n" >> bits.bed printf "x\t10\t20\n" >> bits.bed printf "x\t40\t49\n" >> bits.bed -chop-vg-paths tiny-flat.vg bits.bed > chopped-bits.vg +clip-vg tiny-flat.vg bits.bed > chopped-bits.vg vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" is "$(grep 'x\[0:1\].1' bits.paths | wc -l)" "1" "first bit found" @@ -44,14 +45,14 @@ rm -f tiny-flat.vg #vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg printf "x\t0\t100\n" > all.bed -chop-vg-paths tiny-rev.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa +clip-vg tiny-rev.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" rm -f all.bed chopped-all.gfa printf "x\t0\t1\n" > ends.bed printf "x\t48\t50\n" >> ends.bed -chop-vg-paths tiny-rev.vg ends.bed > chopped-ends.vg +clip-vg tiny-rev.vg ends.bed > chopped-ends.vg is "$(vg paths -Ev chopped-ends.vg)" "x[1:48] 47" "chopping ends gives subpath in the middle with correct length" is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" @@ -61,7 +62,7 @@ printf "x\t20\t25\n" > bits.bed printf "x\t1\t5\n" >> bits.bed printf "x\t10\t20\n" >> bits.bed printf "x\t40\t49\n" >> bits.bed -chop-vg-paths tiny-rev.vg bits.bed > chopped-bits.vg +clip-vg tiny-rev.vg bits.bed > chopped-bits.vg vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" is "$(grep 'x\[0:1\].1' bits.paths | wc -l)" "1" "first bit found" From fa9a930d282eefa31b34cc49a18a65a2a425090e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 08:50:11 -0500 Subject: [PATCH 08/23] add chop-vg --- build-tools/makeBinRelease | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build-tools/makeBinRelease b/build-tools/makeBinRelease index 4ef399a..846f722 100755 --- a/build-tools/makeBinRelease +++ b/build-tools/makeBinRelease @@ -36,4 +36,5 @@ else make check-static fi -cp hal2vg ${buildDir}/ +cp hal2vg chop-vg ${buildDir}/ + From b66ed7e3144d1ac7da62f17aaa8570fda4b22923 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 10:27:44 -0500 Subject: [PATCH 09/23] print stats --- clip-vg.cpp | 17 +++++++++++++++++ tests/t/chop.t | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index f624605..f75634a 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -204,6 +204,12 @@ vector &split_delims(const string &s, const string& delims, vector>>& bed_intervals) { + + // keep some stats to print + size_t chopped_paths = 0; + size_t chopped_nodes = 0; + size_t chopped_bases = 0; + // careful not to iterate and chop, as we could hit new subpaths made vector path_handles; graph->for_each_path_handle([&](path_handle_t path_handle) { @@ -213,17 +219,28 @@ void chop_path_intervals(MutablePathMutableHandleGraph* graph, for (auto path_handle : path_handles) { string path_name = graph->get_path_name(path_handle); auto it = bed_intervals.find(path_name); + bool was_chopped = false; if (it != bed_intervals.end()) { vector chopped_handles = chop_path(graph, path_handle, it->second); if (!chopped_handles.empty()) { graph->destroy_path(path_handle); for (handle_t handle : chopped_handles) { assert(graph->steps_of_handle(handle).empty()); + chopped_bases += graph->get_length(handle); + was_chopped = true; + ++chopped_nodes; dynamic_cast(graph)->destroy_handle(handle); } } } + if (was_chopped) { + ++chopped_paths; + } } + cerr << "[clip-vg] : Clipped " + << chopped_bases << " bases from " + << chopped_nodes << " nodes in " + << chopped_paths << " paths" << endl; } vector chop_path(MutablePathMutableHandleGraph* graph, diff --git a/tests/t/chop.t b/tests/t/chop.t index 91aae08..b7bec38 100644 --- a/tests/t/chop.t +++ b/tests/t/chop.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=./bash-tap PATH=../bin:$PATH PATH=../deps/hal:$PATH -plan tests 16 +plan tests 17 #vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg vg convert -g chop/tiny-flat.gfa -o > tiny-flat.vg @@ -16,6 +16,14 @@ is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" rm -f all.bed chopped-all.gfa +printf "y\t0\t100\n" > none.bed +clip-vg tiny-flat.vg none.bed | vg view - | grep -v ^H > chopped-none.gfa +vg view tiny-flat.vg | grep -v ^H > orig.gfa +diff chopped-none.gfa orig.gfa +is "$?" 0 "chopping nothing doesn't change graph" + +rm -f none.bed chopped-none.gfa orig.gfa + printf "x\t0\t1\n" > ends.bed printf "x\t48\t50\n" >> ends.bed clip-vg tiny-flat.vg ends.bed > chopped-ends.vg From 53c064e83e7abf1c8ef81ae1a563eeb4be52be5d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 12:56:43 -0500 Subject: [PATCH 10/23] add divide-handle assert --- clip-vg.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index f75634a..1556489 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -284,14 +284,17 @@ vector chop_path(MutablePathMutableHandleGraph* graph, cerr << endl; } #endif -#ifdef debug + + size_t total_pieces_length = 0; vector pieces = graph->divide_handle(handle, cut_points) ; for (auto piece : pieces) { + total_pieces_length += graph->get_length(piece); +#ifdef debug cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) << endl; - } -#else - graph->divide_handle(handle, cut_points); #endif + } + // bugs in divide-handle turning out to be a real issue. add this sanity check to catch them early. + assert(total_pieces_length == len); offset += len; } From e184f37bd2b90dbe15af0e8461659f036b5eb6ad Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 13:09:34 -0500 Subject: [PATCH 11/23] progress option --- clip-vg.cpp | 53 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 1556489..36a1013 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -25,6 +25,9 @@ using namespace bdsg; void help(char** argv) { cerr << "usage: " << argv[0] << " [options] " << endl << "Chop out path intervals from a vg graph" << endl + << endl + << "options: " << endl + << " -p, --progress Print progress" << endl << endl; } @@ -32,7 +35,8 @@ static unordered_map>> load_bed(istream& b static unique_ptr load_graph(istream& graph_stream); static vector &split_delims(const string &s, const string& delims, vector &elems); static void chop_path_intervals(MutablePathMutableHandleGraph* graph, - const unordered_map>>& bed_intervals); + const unordered_map>>& bed_intervals, + bool progress = false); static vector chop_path(MutablePathMutableHandleGraph* graph, path_handle_t path_handle, const vector>& intervals); @@ -42,19 +46,21 @@ static inline string make_subpath_name(const string& path_name, size_t offset, s } int main(int argc, char** argv) { - + + bool progress = false; int c; optind = 1; while (true) { static const struct option long_options[] = { {"help", no_argument, 0, 'h'}, + {"progress", no_argument, 0, 'p'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "h", + c = getopt_long (argc, argv, "hp", long_options, &option_index); // Detect the end of the options. @@ -63,6 +69,9 @@ int main(int argc, char** argv) { switch (c) { + case 'p': + progress = true; + break; case 'h': case '?': /* getopt_long already printed an error message. */ @@ -81,13 +90,13 @@ int main(int argc, char** argv) { // Parse the positional argument if (optind >= argc) { - cerr << "[chop-vg-paths] error: too few arguments" << endl; + cerr << "[clip-vg] error: too few arguments" << endl; help(argv); return 1; } if (optind != argc - 2) { - cerr << "[chop-vg-paths] error: too many arguments" << endl; + cerr << "[clip-vg] error: too many arguments" << endl; help(argv); return 1; } @@ -97,21 +106,31 @@ int main(int argc, char** argv) { ifstream bed_stream(bed_path); if (!bed_stream) { - cerr << "[chop-vg-paths] error: Unable to open input BED file " << bed_path << endl; + cerr << "[clip-vg] error: Unable to open input BED file " << bed_path << endl; return 1; } unordered_map>> bed_intervals = load_bed(bed_stream); bed_stream.close(); + if (progress) { + size_t num_intervals = 0; + for (auto& bi : bed_intervals) { + num_intervals += bi.second.size(); + } + cerr << "[clip-vg]: Loaded " << num_intervals << " BED intervals over " << bed_intervals.size() << " sequences" << endl; + } ifstream graph_stream(graph_path); if (!graph_stream) { - cerr << "[chop-vg-paths] error: Unable to open input graph " << graph_path << endl; + cerr << "[clip-vg] error: Unable to open input graph " << graph_path << endl; return 1; } unique_ptr graph = load_graph(graph_stream); graph_stream.close(); - - chop_path_intervals(graph.get(), bed_intervals); + if (progress) { + cerr << "[clip-vg]: Loaded graph" << endl; + } + + chop_path_intervals(graph.get(), bed_intervals, progress); dynamic_cast(graph.get())->serialize(cout); @@ -203,7 +222,8 @@ vector &split_delims(const string &s, const string& delims, vector>>& bed_intervals) { + const unordered_map>>& bed_intervals, + bool progress) { // keep some stats to print size_t chopped_paths = 0; @@ -221,6 +241,9 @@ void chop_path_intervals(MutablePathMutableHandleGraph* graph, auto it = bed_intervals.find(path_name); bool was_chopped = false; if (it != bed_intervals.end()) { + if (progress) { + cerr << "[clip-vg]: Clipping " << it->second.size() << " intervals from path " << path_name << endl; + } vector chopped_handles = chop_path(graph, path_handle, it->second); if (!chopped_handles.empty()) { graph->destroy_path(path_handle); @@ -237,10 +260,12 @@ void chop_path_intervals(MutablePathMutableHandleGraph* graph, ++chopped_paths; } } - cerr << "[clip-vg] : Clipped " - << chopped_bases << " bases from " - << chopped_nodes << " nodes in " - << chopped_paths << " paths" << endl; + if (progress) { + cerr << "[clip-vg]: Clipped " + << chopped_bases << " bases from " + << chopped_nodes << " nodes in " + << chopped_paths << " paths" << endl; + } } vector chop_path(MutablePathMutableHandleGraph* graph, From 63daa964ca577e768f4e3611381b4db26d025fb1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 13:18:38 -0500 Subject: [PATCH 12/23] debug msg --- clip-vg.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 36a1013..d716954 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -315,7 +315,8 @@ vector chop_path(MutablePathMutableHandleGraph* graph, for (auto piece : pieces) { total_pieces_length += graph->get_length(piece); #ifdef debug - cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) << endl; + cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) + << " tlen=" << total_pieces_length << "/" << len << endl; #endif } // bugs in divide-handle turning out to be a real issue. add this sanity check to catch them early. From f84d352fbd5b306d9f5ed66cd1d0c4102fbbd362 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 13:25:07 -0500 Subject: [PATCH 13/23] cleaner debug message --- clip-vg.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index d716954..e7af7b1 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -299,28 +299,27 @@ vector chop_path(MutablePathMutableHandleGraph* graph, } } // chop the node -#ifdef debug if (!cut_points.empty()) { +#ifdef debug cerr << "dividing node_id=" << graph->get_id(handle) << ":" << graph->get_is_reverse(handle) << " seq=" << graph->get_sequence(handle) << " for path " << graph->get_path_name(path_handle) << " at cut points:"; for (auto cp : cut_points) { cerr << " " << cp; } cerr << endl; - } -#endif - - size_t total_pieces_length = 0; - vector pieces = graph->divide_handle(handle, cut_points) ; - for (auto piece : pieces) { - total_pieces_length += graph->get_length(piece); +#endif + size_t total_pieces_length = 0; + vector pieces = graph->divide_handle(handle, cut_points) ; + for (auto piece : pieces) { + total_pieces_length += graph->get_length(piece); #ifdef debug - cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) - << " tlen=" << total_pieces_length << "/" << len << endl; + cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) + << " tlen=" << total_pieces_length << "/" << len << endl; #endif + } + // bugs in divide-handle turning out to be a real issue. add this sanity check to catch them early. + assert(total_pieces_length == len); } - // bugs in divide-handle turning out to be a real issue. add this sanity check to catch them early. - assert(total_pieces_length == len); offset += len; } From 98b4fc68c9f1e09b9d146292c5e022aeb89af4d6 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 13:39:40 -0500 Subject: [PATCH 14/23] more asserts --- clip-vg.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index e7af7b1..3cd6820 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -310,8 +310,15 @@ vector chop_path(MutablePathMutableHandleGraph* graph, #endif size_t total_pieces_length = 0; vector pieces = graph->divide_handle(handle, cut_points) ; - for (auto piece : pieces) { - total_pieces_length += graph->get_length(piece); + for (size_t i = 0; i < pieces.size(); ++i) { + handle_t& piece = pieces[i]; + size_t piece_length = graph->get_length(piece); + if (i == 0) { + assert(piece_length == cut_points[0]); + } else if (i < pieces.size() - 1) { + assert(piece_length == cut_points[i] - cut_points[i-1]); + } + total_pieces_length += piece_length; #ifdef debug cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) << " tlen=" << total_pieces_length << "/" << len << endl; From 848a8610b41876646242eae286b2b195530cd431 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 14:04:19 -0500 Subject: [PATCH 15/23] dont delete twice in cycle --- clip-vg.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 3cd6820..664e702 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -37,9 +37,9 @@ static vector &split_delims(const string &s, const string& delims, vecto static void chop_path_intervals(MutablePathMutableHandleGraph* graph, const unordered_map>>& bed_intervals, bool progress = false); -static vector chop_path(MutablePathMutableHandleGraph* graph, - path_handle_t path_handle, - const vector>& intervals); +static unordered_set chop_path(MutablePathMutableHandleGraph* graph, + path_handle_t path_handle, + const vector>& intervals); // Create a subpath name (todo: make same function in vg consistent (it only includes start)) static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) { return path_name + "[" + std::to_string(offset) + ":" + std::to_string(offset + length) + "]"; @@ -244,7 +244,7 @@ void chop_path_intervals(MutablePathMutableHandleGraph* graph, if (progress) { cerr << "[clip-vg]: Clipping " << it->second.size() << " intervals from path " << path_name << endl; } - vector chopped_handles = chop_path(graph, path_handle, it->second); + auto chopped_handles = chop_path(graph, path_handle, it->second); if (!chopped_handles.empty()) { graph->destroy_path(path_handle); for (handle_t handle : chopped_handles) { @@ -268,9 +268,9 @@ void chop_path_intervals(MutablePathMutableHandleGraph* graph, } } -vector chop_path(MutablePathMutableHandleGraph* graph, - path_handle_t path_handle, - const vector>& intervals) { +unordered_set chop_path(MutablePathMutableHandleGraph* graph, + path_handle_t path_handle, + const vector>& intervals) { // get the breakpoints set breakpoints; @@ -332,7 +332,7 @@ vector chop_path(MutablePathMutableHandleGraph* graph, steps.clear(); int64_t original_path_length = offset; - vector chopped_handles; + unordered_set chopped_handles; offset = 0; step_handle_t current_step = graph->path_begin(path_handle); #ifdef debug @@ -373,6 +373,7 @@ vector chop_path(MutablePathMutableHandleGraph* graph, } }; + for (size_t i = 0; i < intervals.size(); ++i) { if (intervals[i].first > offset) { // cut everythign left of the interval @@ -386,7 +387,7 @@ vector chop_path(MutablePathMutableHandleGraph* graph, #ifdef debug cerr << "deleting " << graph->get_id(handle) << endl; #endif - chopped_handles.push_back(handle); + chopped_handles.insert(handle); } } From 3c2fa05410246144dc220a0048ab639b1e9c377e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 14:15:19 -0500 Subject: [PATCH 16/23] remove odgi-only message as it doesnt work either --- clip-vg.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 664e702..3d27172 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -182,11 +182,6 @@ unique_ptr load_graph(istream& graph_stream) { graph_stream.clear(); graph_stream.seekg(0, ios::beg); - if (magic_number != ODGI().get_magic_number()) { - cerr << "Only ODGI supported until this bug is fixed: https://github.com/vgteam/libbdsg/issues/94" << endl; - exit(1); - } - MutablePathMutableHandleGraph* graph; if (magic_number == PackedGraph().get_magic_number()) { graph = new PackedGraph(); From 511d5fd908be372d02c261127393e55bf1817562 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Jan 2021 14:58:16 -0500 Subject: [PATCH 17/23] update libbdsg-easy --- deps/libbdsg-easy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libbdsg-easy b/deps/libbdsg-easy index 1a14abb..fbd5f11 160000 --- a/deps/libbdsg-easy +++ b/deps/libbdsg-easy @@ -1 +1 @@ -Subproject commit 1a14abb2db4e8a4ce02be4e6d3e7aba5e8d4c1ac +Subproject commit fbd5f111bb2437099cd3deeeb0a138c9e9d0fd96 From d9ee85e8baee9662279a7f1f06fd70caceb2a304 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Jan 2021 16:04:05 -0500 Subject: [PATCH 18/23] update libbdsg --- deps/libbdsg-easy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libbdsg-easy b/deps/libbdsg-easy index fbd5f11..3d16a8b 160000 --- a/deps/libbdsg-easy +++ b/deps/libbdsg-easy @@ -1 +1 @@ -Subproject commit fbd5f111bb2437099cd3deeeb0a138c9e9d0fd96 +Subproject commit 3d16a8ba5002e2e2e158877b0919dbde871bb067 From 59601afd23b86fce9e1206163c08756df05c2997 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Jan 2021 16:19:17 -0500 Subject: [PATCH 19/23] take : out of subpath name as it messes up vcf --- clip-vg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 3d27172..f107e56 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -42,7 +42,7 @@ static unordered_set chop_path(MutablePathMutableHandleGraph* graph, const vector>& intervals); // Create a subpath name (todo: make same function in vg consistent (it only includes start)) static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) { - return path_name + "[" + std::to_string(offset) + ":" + std::to_string(offset + length) + "]"; + return path_name + "[" + std::to_string(offset) + "-" + std::to_string(offset + length) + "]"; } int main(int argc, char** argv) { From cd76271749c7f9bcee74f37e6933a5fe2557dc2e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Jan 2021 16:30:47 -0500 Subject: [PATCH 20/23] update test for new char --- tests/t/chop.t | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/t/chop.t b/tests/t/chop.t index b7bec38..70d29db 100644 --- a/tests/t/chop.t +++ b/tests/t/chop.t @@ -27,7 +27,7 @@ rm -f none.bed chopped-none.gfa orig.gfa printf "x\t0\t1\n" > ends.bed printf "x\t48\t50\n" >> ends.bed clip-vg tiny-flat.vg ends.bed > chopped-ends.vg -is "$(vg paths -Ev chopped-ends.vg)" "x[1:48] 47" "chopping ends gives subpath in the middle with correct length" +is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" rm -f ends.bed chopped-ends.vg @@ -39,10 +39,10 @@ printf "x\t40\t49\n" >> bits.bed clip-vg tiny-flat.vg bits.bed > chopped-bits.vg vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" -is "$(grep 'x\[0:1\].1' bits.paths | wc -l)" "1" "first bit found" -is "$(grep 'x\[5:10\].5' bits.paths | wc -l)" "1" "next bit found" -is "$(grep 'x\[25:40\].15' bits.paths | wc -l)" "1" "next bit after found" -is "$(grep 'x\[49:50\].1' bits.paths | wc -l)" "1" "last bit found" +is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" +is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" +is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" "next bit after found" +is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" rm -f bits.bed chopped-bits.vg bits.paths @@ -61,7 +61,7 @@ rm -f all.bed chopped-all.gfa printf "x\t0\t1\n" > ends.bed printf "x\t48\t50\n" >> ends.bed clip-vg tiny-rev.vg ends.bed > chopped-ends.vg -is "$(vg paths -Ev chopped-ends.vg)" "x[1:48] 47" "chopping ends gives subpath in the middle with correct length" +is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" rm -f ends.bed chopped-ends.vg @@ -73,10 +73,10 @@ printf "x\t40\t49\n" >> bits.bed clip-vg tiny-rev.vg bits.bed > chopped-bits.vg vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" -is "$(grep 'x\[0:1\].1' bits.paths | wc -l)" "1" "first bit found" -is "$(grep 'x\[5:10\].5' bits.paths | wc -l)" "1" "next bit found" -is "$(grep 'x\[25:40\].15' bits.paths | wc -l)" "1" "next bit after found" -is "$(grep 'x\[49:50\].1' bits.paths | wc -l)" "1" "last bit found" +is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" +is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" +is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" "next bit after found" +is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" rm -f bits.bed chopped-bits.vg bits.paths From b874bb6536545d1e927812f4ff034591d1e06de0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 28 Jan 2021 10:41:04 -0500 Subject: [PATCH 21/23] update libbdsg and use packed graph again in test --- deps/libbdsg-easy | 2 +- tests/t/chop.t | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deps/libbdsg-easy b/deps/libbdsg-easy index 3d16a8b..e37cf0e 160000 --- a/deps/libbdsg-easy +++ b/deps/libbdsg-easy @@ -1 +1 @@ -Subproject commit 3d16a8ba5002e2e2e158877b0919dbde871bb067 +Subproject commit e37cf0e1981abb8bb8a93e879d11e8b3b8a07fdd diff --git a/tests/t/chop.t b/tests/t/chop.t index 70d29db..2f84abd 100644 --- a/tests/t/chop.t +++ b/tests/t/chop.t @@ -50,8 +50,8 @@ rm -f tiny-flat.vg ########## flip path and repeat ########## -#vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg -vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg +vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg +#vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg printf "x\t0\t100\n" > all.bed clip-vg tiny-rev.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" From 41a46491d7051352daeac509895e8066fa551cf8 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 1 Feb 2021 15:38:58 -0500 Subject: [PATCH 22/23] rename --- build-tools/makeBinRelease | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-tools/makeBinRelease b/build-tools/makeBinRelease index 846f722..a29bf5c 100755 --- a/build-tools/makeBinRelease +++ b/build-tools/makeBinRelease @@ -36,5 +36,5 @@ else make check-static fi -cp hal2vg chop-vg ${buildDir}/ +cp hal2vg clip-vg ${buildDir}/ From 06be915d9210336f8f7dad8cdf9a6ee23b9d8e01 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 1 Feb 2021 15:53:22 -0500 Subject: [PATCH 23/23] check --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 17614c3..c282097 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,11 @@ ifeq ($(shell ldd hal2vg | grep "not a dynamic" | wc -l), $(shell ls hal2vg | wc else $(error ldd found dnymaic linked dependency in hal2vg) endif +ifeq ($(shell ldd clip-vg | grep "not a dynamic" | wc -l), $(shell ls clip-vg | wc -l)) + $(info ldd verified that clip-vg static) +else + $(error ldd found dnymaic linked dependency in clip-vg) +endif cleanFast : rm -f hal2vg hal2vg.o clip-vg clip-vg.o