diff --git a/.travis.yml b/.travis.yml index bdaa04c..242b8bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ before_install: install: - sudo pip3 install setuptools --upgrade - - wget https://github.com/vgteam/vg/releases/download/v1.24.0/vg && chmod u+x vg + - wget https://github.com/vgteam/vg/releases/download/v1.30.0/vg && chmod u+x vg script: - export PATH=$(pwd):$PATH diff --git a/Makefile b/Makefile index 82f486a..c282097 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ rootPath = ./ include ./include.mk -all : hal2vg +all : hal2vg clip-vg # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed # from source and configured with "--enable-static --disable-shared", then have its @@ -18,12 +18,17 @@ ifeq ($(shell ldd hal2vg | grep "not a dynamic" | wc -l), $(shell ls hal2vg | wc else $(error ldd found dnymaic linked dependency in hal2vg) endif +ifeq ($(shell ldd clip-vg | grep "not a dynamic" | wc -l), $(shell ls clip-vg | wc -l)) + $(info ldd verified that clip-vg static) +else + $(error ldd found dnymaic linked dependency in clip-vg) +endif cleanFast : - rm -f hal2vg hal2vg.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o clean : - rm -f hal2vg hal2vg.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o cd deps/sonLib && make clean cd deps/pinchesAndCacti && make clean cd deps/hal && make clean @@ -47,5 +52,12 @@ ${libbdsgPath}/lib/libbdsg.a : hal2vg : hal2vg.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread hal2vg.o ${basicLibs} -o hal2vg -test : hal2vg - cd tests && prove -v small.t +clip-vg.o : clip-vg.cpp ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -I . clip-vg.cpp -c + +clip-vg : clip-vg.o ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -fopenmp -pthread clip-vg.o ${basicLibs} -o clip-vg + +test : + make + cd tests && prove -v t diff --git a/build-tools/makeBinRelease b/build-tools/makeBinRelease index 4ef399a..a29bf5c 100755 --- a/build-tools/makeBinRelease +++ b/build-tools/makeBinRelease @@ -36,4 +36,5 @@ else make check-static fi -cp hal2vg ${buildDir}/ +cp hal2vg clip-vg ${buildDir}/ + diff --git a/clip-vg.cpp b/clip-vg.cpp new file mode 100644 index 0000000..f107e56 --- /dev/null +++ b/clip-vg.cpp @@ -0,0 +1,395 @@ +// Chop regions (from BED File) out of paths in vg graphs, creating subpath names and cutting out nodes or parts of nodes +// Assumes that: +// - regions don't overlap (error otherwise) +// - regions are only ever part of at most one path each (assert false otherwise) + +//#define debug + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include "bdsg/odgi.hpp" + +using namespace std; +using namespace handlegraph; +using namespace bdsg; + +void help(char** argv) { + cerr << "usage: " << argv[0] << " [options] " << endl + << "Chop out path intervals from a vg graph" << endl + << endl + << "options: " << endl + << " -p, --progress Print progress" << endl + << endl; +} + +static unordered_map>> load_bed(istream& bed_stream); +static unique_ptr load_graph(istream& graph_stream); +static vector &split_delims(const string &s, const string& delims, vector &elems); +static void chop_path_intervals(MutablePathMutableHandleGraph* graph, + const unordered_map>>& bed_intervals, + bool progress = false); +static unordered_set chop_path(MutablePathMutableHandleGraph* graph, + path_handle_t path_handle, + const vector>& intervals); +// Create a subpath name (todo: make same function in vg consistent (it only includes start)) +static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) { + return path_name + "[" + std::to_string(offset) + "-" + std::to_string(offset + length) + "]"; +} + +int main(int argc, char** argv) { + + bool progress = false; + int c; + optind = 1; + while (true) { + + static const struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"progress", no_argument, 0, 'p'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "hp", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'p': + progress = true; + break; + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help(argv); + exit(1); + break; + default: + abort (); + } + } + + if (argc <= 1) { + help(argv); + return 1; + } + + // Parse the positional argument + if (optind >= argc) { + cerr << "[clip-vg] error: too few arguments" << endl; + help(argv); + return 1; + } + + if (optind != argc - 2) { + cerr << "[clip-vg] error: too many arguments" << endl; + help(argv); + return 1; + } + + string graph_path = argv[optind++]; + string bed_path = argv[optind++]; + + ifstream bed_stream(bed_path); + if (!bed_stream) { + cerr << "[clip-vg] error: Unable to open input BED file " << bed_path << endl; + return 1; + } + unordered_map>> bed_intervals = load_bed(bed_stream); + bed_stream.close(); + if (progress) { + size_t num_intervals = 0; + for (auto& bi : bed_intervals) { + num_intervals += bi.second.size(); + } + cerr << "[clip-vg]: Loaded " << num_intervals << " BED intervals over " << bed_intervals.size() << " sequences" << endl; + } + + ifstream graph_stream(graph_path); + if (!graph_stream) { + cerr << "[clip-vg] error: Unable to open input graph " << graph_path << endl; + return 1; + } + unique_ptr graph = load_graph(graph_stream); + graph_stream.close(); + if (progress) { + cerr << "[clip-vg]: Loaded graph" << endl; + } + + chop_path_intervals(graph.get(), bed_intervals, progress); + + dynamic_cast(graph.get())->serialize(cout); + + return 0; +} + +unordered_map>> load_bed(istream& bed_stream) { + // load bed + unordered_map>> intervals; + string buffer; + while (getline(bed_stream, buffer)) { + vector toks; + split_delims(buffer, "\t\n", toks); + if (toks.size() >= 3) { + string& name = toks[0]; + int64_t start = stol(toks[1]); + int64_t end = stol(toks[2]); + intervals[name].push_back(make_pair(start, end)); + } + } + // verify bed + for (auto& seq_intervals : intervals) { + sort(seq_intervals.second.begin(), seq_intervals.second.end(), + [](const pair& b1, const pair& b2) { + return b1.first < b2.first || (b1.first == b2.first && b1.second < b2.second); + }); + for (size_t i = 1; i < seq_intervals.second.size(); ++i) { + if (seq_intervals.second[i].first < seq_intervals.second[i-1].second) { + cerr << "Overlapping bed intervals found:\n" + << " " << seq_intervals.first << "\t" + << seq_intervals.second[i-1].first << "\t" + << seq_intervals.second[i-1].second << endl + << " " << seq_intervals.first << "\t" + << seq_intervals.second[i].first << "\t" + << seq_intervals.second[i].second << endl + << "These are not supported. Please clean up (ex with bedools merge) first" << endl; + exit(1); + } + } + } + return intervals; +} + +unique_ptr load_graph(istream& graph_stream) { + + char magic_bytes[4]; + graph_stream.read(magic_bytes, 4); + uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); + graph_stream.clear(); + graph_stream.seekg(0, ios::beg); + + MutablePathMutableHandleGraph* graph; + if (magic_number == PackedGraph().get_magic_number()) { + graph = new PackedGraph(); + } else if (magic_number == HashGraph().get_magic_number()) { + graph = new HashGraph(); + } else if (magic_number == ODGI().get_magic_number()) { + graph = new ODGI(); + } else { + cerr << "Unable to parse input graph with magic number " << magic_number << endl; + exit(1); + } + dynamic_cast(graph)->deserialize(graph_stream); + + return unique_ptr(graph); +} + +vector &split_delims(const string &s, const string& delims, vector &elems) { + size_t start = string::npos; + for (size_t i = 0; i < s.size(); ++i) { + if (delims.find(s[i]) != string::npos) { + if (start != string::npos && i > start) { + elems.push_back(s.substr(start, i - start)); + } + start = string::npos; + } else if (start == string::npos) { + start = i; + } + } + if (start != string::npos && start < s.size()) { + elems.push_back(s.substr(start, s.size() - start)); + } + return elems; +} + +void chop_path_intervals(MutablePathMutableHandleGraph* graph, + const unordered_map>>& bed_intervals, + bool progress) { + + // keep some stats to print + size_t chopped_paths = 0; + size_t chopped_nodes = 0; + size_t chopped_bases = 0; + + // careful not to iterate and chop, as we could hit new subpaths made + vector path_handles; + graph->for_each_path_handle([&](path_handle_t path_handle) { + path_handles.push_back(path_handle); + }); + + for (auto path_handle : path_handles) { + string path_name = graph->get_path_name(path_handle); + auto it = bed_intervals.find(path_name); + bool was_chopped = false; + if (it != bed_intervals.end()) { + if (progress) { + cerr << "[clip-vg]: Clipping " << it->second.size() << " intervals from path " << path_name << endl; + } + auto chopped_handles = chop_path(graph, path_handle, it->second); + if (!chopped_handles.empty()) { + graph->destroy_path(path_handle); + for (handle_t handle : chopped_handles) { + assert(graph->steps_of_handle(handle).empty()); + chopped_bases += graph->get_length(handle); + was_chopped = true; + ++chopped_nodes; + dynamic_cast(graph)->destroy_handle(handle); + } + } + } + if (was_chopped) { + ++chopped_paths; + } + } + if (progress) { + cerr << "[clip-vg]: Clipped " + << chopped_bases << " bases from " + << chopped_nodes << " nodes in " + << chopped_paths << " paths" << endl; + } +} + +unordered_set chop_path(MutablePathMutableHandleGraph* graph, + path_handle_t path_handle, + const vector>& intervals) { + + // get the breakpoints + set breakpoints; + for (const pair& interval : intervals) { + breakpoints.insert(interval.first); + breakpoints.insert(interval.second); // we're cutting before offset, so the open coordinate is what we want + } + + // to be safe, don't cut and iterate at the same time, so load up steps here + vector steps; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + steps.push_back(graph->get_handle_of_step(step_handle)); + }); + + // cut the nodes to ensure breakpoints at node boundaries + int64_t offset = 0; + for (auto handle : steps) { + int64_t len = graph->get_length(handle); + // find breakpoints in node + vector cut_points; + for (auto i = breakpoints.lower_bound(offset); i != breakpoints.end() && *i - offset < len; ++i) { + int64_t cut_point = *i - offset; + // libbdsg is buggy and can't accept cutpoints on ends on reverse strand + if (cut_point > 0 && cut_point < len) { + cut_points.push_back(cut_point); + } + } + // chop the node + if (!cut_points.empty()) { +#ifdef debug + cerr << "dividing node_id=" << graph->get_id(handle) << ":" << graph->get_is_reverse(handle) << " seq=" << graph->get_sequence(handle) + << " for path " << graph->get_path_name(path_handle) << " at cut points:"; + for (auto cp : cut_points) { + cerr << " " << cp; + } + cerr << endl; +#endif + size_t total_pieces_length = 0; + vector pieces = graph->divide_handle(handle, cut_points) ; + for (size_t i = 0; i < pieces.size(); ++i) { + handle_t& piece = pieces[i]; + size_t piece_length = graph->get_length(piece); + if (i == 0) { + assert(piece_length == cut_points[0]); + } else if (i < pieces.size() - 1) { + assert(piece_length == cut_points[i] - cut_points[i-1]); + } + total_pieces_length += piece_length; +#ifdef debug + cerr << " piece " << graph->get_id(piece) << ":" << graph->get_is_reverse(piece) << " " << graph->get_sequence(piece) + << " tlen=" << total_pieces_length << "/" << len << endl; +#endif + } + // bugs in divide-handle turning out to be a real issue. add this sanity check to catch them early. + assert(total_pieces_length == len); + } + offset += len; + } + + steps.clear(); + int64_t original_path_length = offset; + unordered_set chopped_handles; + offset = 0; + step_handle_t current_step = graph->path_begin(path_handle); +#ifdef debug + cerr << "init step to " << graph->get_id(graph->get_handle_of_step(current_step)) << ":" << graph->get_is_reverse(graph->get_handle_of_step(current_step)) + << " seq=" <get_sequence(graph->get_handle_of_step(current_step)) << endl; +#endif + vector subpaths; + + // cut out a subpath and make a new path out of it + function cut_to = [&](int64_t end_offset) { +#ifdef debug + cerr << "\ncut_to " << end_offset << " where current offset is " << offset << endl; +#endif + vector steps; + int64_t start_offset = offset; + int64_t path_length = 0; + while (offset < end_offset && current_step != graph->path_end(path_handle)) { + handle_t handle = graph->get_handle_of_step(current_step); + steps.push_back(handle); +#ifdef debug + cerr << " pushing subpath step " << graph->get_id(handle) << " len=" << graph->get_length(handle) << endl; +#endif + offset += graph->get_length(handle); + current_step = graph->get_next_step(current_step); + path_length += graph->get_length(handle); + } +#ifdef debug + cerr << "start offset=" << start_offset << " path length=" << path_length << " end offset=" << end_offset << endl; +#endif + assert(start_offset + path_length == end_offset); + + if (path_length > 0) { + path_handle_t subpath_handle = graph->create_path_handle(make_subpath_name(graph->get_path_name(path_handle), start_offset, path_length)); + for (auto step : steps) { + graph->append_step(subpath_handle, step); + } + subpaths.push_back(subpath_handle); + } + }; + + + for (size_t i = 0; i < intervals.size(); ++i) { + if (intervals[i].first > offset) { + // cut everythign left of the interval + cut_to(intervals[i].first); + } + // scan past the interval + while (offset < intervals[i].second && current_step != graph->path_end(path_handle)) { + handle_t handle = graph->get_handle_of_step(current_step); + offset += graph->get_length(handle); + current_step = graph->get_next_step(current_step); +#ifdef debug + cerr << "deleting " << graph->get_id(handle) << endl; +#endif + chopped_handles.insert(handle); + } + } + + // cut the last bit + if (offset < original_path_length) { + cut_to(original_path_length); + } + + return chopped_handles; +} diff --git a/deps/libbdsg-easy b/deps/libbdsg-easy index 3e618e9..e37cf0e 160000 --- a/deps/libbdsg-easy +++ b/deps/libbdsg-easy @@ -1 +1 @@ -Subproject commit 3e618e9d8c97f8ca61f9d99964100618f05d7418 +Subproject commit e37cf0e1981abb8bb8a93e879d11e8b3b8a07fdd diff --git a/tests/chop/tiny-flat.gfa b/tests/chop/tiny-flat.gfa new file mode 100644 index 0000000..c24148c --- /dev/null +++ b/tests/chop/tiny-flat.gfa @@ -0,0 +1,5 @@ +H VN:Z:1.0 +S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA +S 2 TTATATTCCAACTCTCTG +P x 1+,2+ * +L 1 + 2 + * diff --git a/tests/chop/tiny-rev.gfa b/tests/chop/tiny-rev.gfa new file mode 100644 index 0000000..6f0cd07 --- /dev/null +++ b/tests/chop/tiny-rev.gfa @@ -0,0 +1,5 @@ +H VN:Z:1.0 +S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA +S 2 TTATATTCCAACTCTCTG +P x 2-,1- * +L 1 + 2 + * diff --git a/tests/t/chop.t b/tests/t/chop.t new file mode 100644 index 0000000..2f84abd --- /dev/null +++ b/tests/t/chop.t @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +BASH_TAP_ROOT=./bash-tap +. ${BASH_TAP_ROOT}/bash-tap-bootstrap + +PATH=../bin:$PATH +PATH=../deps/hal:$PATH + +plan tests 17 + +#vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg +vg convert -g chop/tiny-flat.gfa -o > tiny-flat.vg +printf "x\t0\t100\n" > all.bed +clip-vg tiny-flat.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa +is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" + +rm -f all.bed chopped-all.gfa + +printf "y\t0\t100\n" > none.bed +clip-vg tiny-flat.vg none.bed | vg view - | grep -v ^H > chopped-none.gfa +vg view tiny-flat.vg | grep -v ^H > orig.gfa +diff chopped-none.gfa orig.gfa +is "$?" 0 "chopping nothing doesn't change graph" + +rm -f none.bed chopped-none.gfa orig.gfa + +printf "x\t0\t1\n" > ends.bed +printf "x\t48\t50\n" >> ends.bed +clip-vg tiny-flat.vg ends.bed > chopped-ends.vg +is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" +is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" + +rm -f ends.bed chopped-ends.vg + +printf "x\t20\t25\n" > bits.bed +printf "x\t1\t5\n" >> bits.bed +printf "x\t10\t20\n" >> bits.bed +printf "x\t40\t49\n" >> bits.bed +clip-vg tiny-flat.vg bits.bed > chopped-bits.vg +vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths +is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" +is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" +is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" +is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" "next bit after found" +is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" + +rm -f bits.bed chopped-bits.vg bits.paths + +rm -f tiny-flat.vg + +########## flip path and repeat ########## + +vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg +#vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg +printf "x\t0\t100\n" > all.bed +clip-vg tiny-rev.vg all.bed | vg view - | grep -v ^H > chopped-all.gfa +is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" + +rm -f all.bed chopped-all.gfa + +printf "x\t0\t1\n" > ends.bed +printf "x\t48\t50\n" >> ends.bed +clip-vg tiny-rev.vg ends.bed > chopped-ends.vg +is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" +is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" + +rm -f ends.bed chopped-ends.vg + +printf "x\t20\t25\n" > bits.bed +printf "x\t1\t5\n" >> bits.bed +printf "x\t10\t20\n" >> bits.bed +printf "x\t40\t49\n" >> bits.bed +clip-vg tiny-rev.vg bits.bed > chopped-bits.vg +vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths +is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" +is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" +is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" +is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" "next bit after found" +is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" + +rm -f bits.bed chopped-bits.vg bits.paths + +rm -f tiny-rev.vg diff --git a/tests/small.t b/tests/t/small.t similarity index 100% rename from tests/small.t rename to tests/t/small.t