From dd58c89449d03e6766f4878db3d3e4ebae3c7398 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Sat, 12 Oct 2019 14:12:42 -0700 Subject: [PATCH 01/79] Take two minimizers if only one seed --- src/minimizer_mapper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index d1b87cf549b..fbbc89437b0 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -109,7 +109,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // of the selected minimizers is not high enough. size_t hits = minimizer_index.count(minimizers[minimizer_num]); - if (seeds.size() < 2 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { + if (seeds.size() == 1 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { // Locate the hits. for (auto& hit : minimizer_index.find(minimizers[minimizer_num])) { // Reverse the hits for a reverse minimizer @@ -317,7 +317,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { } if (read_coverage_by_cluster[cluster_num] == curr_coverage && cluster_score[cluster_num] == curr_score && - curr_kept < max_extensions / 2) { + curr_kept < max_extensions * 0.75) { curr_kept++; curr_count++; } else if (!read_coverage_by_cluster[cluster_num] == curr_coverage || From 4bffbd1cf393f20ff0ca75b9a2c95e21c5b277bb Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Sat, 12 Oct 2019 17:24:20 -0700 Subject: [PATCH 02/79] Cut off at hard hit cap --- src/minimizer_mapper.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index fbbc89437b0..8cc2ac2f0ec 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -109,14 +109,19 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // of the selected minimizers is not high enough. size_t hits = minimizer_index.count(minimizers[minimizer_num]); - if (seeds.size() == 1 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { + if (seeds.size() <= 1 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { // Locate the hits. 
+ size_t added_hits = 0;
 for (auto& hit : minimizer_index.find(minimizers[minimizer_num])) {
 // Reverse the hits for a reverse minimizer
 if (minimizers[minimizer_num].is_reverse) {
 size_t node_length = gbwt_graph.get_length(gbwt_graph.get_handle(id(hit)));
 hit = reverse_base_pos(hit, node_length);
 }
+ if (added_hits > hard_hit_cap) {
+ //Take only up to hard_hit_cap
+ continue;
+ }
 // For each position, remember it and what minimizer it came from
 seeds.push_back(hit);
 seed_to_source.push_back(minimizer_num);

From c4b150e1fba28f941059797067761f8e752b46f4 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Thu, 17 Oct 2019 14:58:25 -0700
Subject: [PATCH 03/79] Changed default parameters

---
 scripts/giraffe-wrangler.sh | 2 +-
 src/subcommand/gaffe_main.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh
index 9bce06a9bad..5dfaedb3083 100755
--- a/scripts/giraffe-wrangler.sh
+++ b/scripts/giraffe-wrangler.sh
@@ -91,7 +91,7 @@ echo "${SIM_GAM}"
 echo "${REAL_FASTQ}"

 # Define the Giraffe parameters
-GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 300 -a 4 -s 50 -u 0.3 -v 1 -w 20)
+GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 150 -a 4 -s 50 -u 0.4 -v 1 -w 20)

 # Define a work directory
 # TODO: this requires GNU mktemp
diff --git a/src/subcommand/gaffe_main.cpp b/src/subcommand/gaffe_main.cpp
index 8269e6233a3..fd882eaedba 100644
--- a/src/subcommand/gaffe_main.cpp
+++ b/src/subcommand/gaffe_main.cpp
@@ -358,13 +358,13 @@ int main_gaffe(int argc, char** argv) {
 // How many mappings per read can we emit?
 Range<size_t> max_multimaps = 1;
 // How many clusters should we extend?
- Range<size_t> max_extensions = 300;
+ Range<size_t> max_extensions = 150;
 // How many extended clusters should we align, max?
 Range<size_t> max_alignments = 4;
 //Throw away cluster with scores that are this amount below the best
 Range<double> cluster_score = 50;
 //Throw away clusters with coverage this amount below the best
- Range<double> cluster_coverage = 0.3;
+ Range<double> cluster_coverage = 0.4;
 //Throw away extension sets with scores that are this amount below the best
 Range<double> extension_set = 20;
 //Throw away extensions with scores that are this amount below the best

From 7c42cd2bf3305abb14f4d3bfd9ef71487667d685 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Mon, 28 Oct 2019 09:25:53 -0700
Subject: [PATCH 04/79] Started paired end clusterer

---
 src/minimizer_mapper.cpp | 3 +--
 src/seed_clusterer.cpp | 34 +++++++++++++++++++++------------
 src/seed_clusterer.hpp | 41 ++++++++++++++++++++++------------
 3 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp
index 7cfe7bd0ce6..5a028a137d1 100644
--- a/src/minimizer_mapper.cpp
+++ b/src/minimizer_mapper.cpp
@@ -187,8 +187,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) {
 }

 // Cluster the seeds. Get sets of input seed indexes that go together.
- tuple<vector<vector<size_t>>, vector<vector<size_t>>> paired_clusters = clusterer.cluster_seeds(seeds, distance_limit);
- vector<vector<size_t>> clusters = std::move(std::get<0>(paired_clusters));
+ vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, distance_limit);

 if (track_provenance) {
 funnel.substage("score");
diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp
index 836288ee745..c05d75efbd7 100644
--- a/src/seed_clusterer.cpp
+++ b/src/seed_clusterer.cpp
@@ -10,8 +10,16 @@ namespace vg {
 dist_index(dist_index){
 };

- tuple<vector<vector<size_t>>, vector<vector<size_t>>> SnarlSeedClusterer::cluster_seeds (
- vector<pos_t> seeds, int64_t read_distance_limit,
+ vector<vector<size_t>> SnarlSeedClusterer::cluster_seeds (vector<pos_t> seeds, int64_t read_distance_limit) const {
+ vector<vector<pos_t>> all_seeds;
+ all_seeds.push_back(std::move(seeds));
+ tuple<vector<vector<vector<size_t>>>, vector<vector<size_t>>> all_clusters =
+ cluster_seeds(all_seeds, read_distance_limit);
+ return std::get<0>(all_clusters)[0];
+ };
+
+ tuple<vector<vector<vector<size_t>>>, vector<vector<size_t>>> SnarlSeedClusterer::cluster_seeds (
+ vector<vector<pos_t>> all_seeds, int64_t read_distance_limit,
 int64_t fragment_distance_limit) const {
 /* Given a vector of seeds and a limit, find a clustering of seeds where
 * seeds that are closer than the limit cluster together.
@@ -37,7 +45,7 @@ cerr << endl << "New cluster calculation:" << endl;
 //This stores all the tree relationships and cluster information
 //for a single level of the snarl tree as it is being processed
 //It also keeps track of the parents of the current level
- TreeState tree_state (&seeds, read_distance_limit, fragment_distance_limit);
+ TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit);

 //Populate tree_state.node_to_seeds (mapping each node to the seeds it
 //contains) and snarl_to_nodes_by_level
@@ -108,25 +116,27 @@ cerr << endl << "New cluster calculation:" << endl;
 snarl_to_nodes) const {

 // Assign each seed to a node.
- tree_state.node_to_seeds.reserve(tree_state.seeds->size());
+ tree_state.node_to_seeds.reserve(tree_state.all_seeds->size());
 for (size_t i = 0; i < tree_state.seeds->size(); i++) {
- id_t id = get_id(tree_state.seeds->at(i));
- tree_state.node_to_seeds.emplace_back(id, i);
- //For each seed, assign it to a node and the node to a snarl
+ for (size_t j = 0 ; j < tree_state.all_seeds->at(i).size() ; j++) {
+ id_t id = get_id(tree_state.all_seeds->at(i).at(j));
+ tree_state.node_to_seeds.emplace_back(id, i, j);
+ //For each seed, assign it to a node and the node to a snarl
+ }
 }
 std::sort(tree_state.node_to_seeds.begin(), tree_state.node_to_seeds.end());

 // Assign each node to a snarl.
id_t prev_node = -1;
 for (auto mapping : tree_state.node_to_seeds) {
- if (mapping.first == prev_node) {
+ if (get<0>(mapping) == prev_node) {
 continue;
 }
- prev_node = mapping.first;
- size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first);
+ prev_node = get<0>(mapping);
+ size_t snarl_i = dist_index.getPrimaryAssignment(get<0>(mapping));
 size_t depth = dist_index.snarl_indexes[snarl_i].depth;
 snarl_to_nodes[depth][snarl_i].emplace_back(
- NetgraphNode(mapping.first, NODE), NodeClusters());
+ NetgraphNode(get<0>(mapping), NODE), NodeClusters());
 }
 }
@@ -271,7 +281,7 @@ cerr << endl << "New cluster calculation:" << endl;
 auto seed_range_start = std::lower_bound(
 tree_state.node_to_seeds.begin(), tree_state.node_to_seeds.end(),
- std::pair<id_t, size_t>(node_id, 0));
+ std::tuple<id_t, size_t, size_t>(node_id, 0, 0));

 //indices of union find group ids of clusters in this node
 NodeClusters node_clusters;
@@ -287,7 +297,7 @@ cerr << endl << "New cluster calculation:" << endl;

 //And find the shortest distance from any seed to both
 //ends of the node
- pos_t seed = tree_state.seeds->at(iter->second);
+ pos_t seed = tree_state.all_seeds->at(std::get<1>(*iter)).at(std::get<2>(*iter));
 int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) : get_offset(seed) + 1;
 int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1
diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp
index e7fad28642a..03c4d20346c 100644
--- a/src/seed_clusterer.hpp
+++ b/src/seed_clusterer.hpp
@@ -18,15 +18,24 @@ class SnarlSeedClusterer {
 //cluster the seeds such that two seeds whose minimum distance
 //between them (including both of the positions) is less than
 // the distance limit are in the same cluster
- //If a fragment_distance_limit is give, then also cluster based on
- //this distance for paired-end clusters. fragment_distance_limit
- //must be greater than read_distance_limit
- //If fragment_distance_limit is 0, then ignore it
+ //
 //Returns a vector of clusters. Each cluster is a vector of
 //indices into seeds
- tuple<vector<vector<size_t>>, vector<vector<size_t>>> cluster_seeds (
- vector<pos_t> seeds,
+ vector<vector<size_t>> cluster_seeds (
+ vector<pos_t> seeds, int64_t read_distance_limit) const;
+
+ ///The same thing, but for paired end reads.
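+ //(Illustration: if two reads contribute 3 and 2 seeds, the read clusters
+ // index each read's seeds as 0..2 and 0..1 separately, while the fragment
+ // clusters index them jointly as 0..4, as described below.)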
+ //Given seeds from multiple reads of a fragment, cluster each set of seeds
+ //by the read distance and all seeds by the fragment distance limit
+ //fragment_distance_limit must be greater than read_distance_limit
+ //Returns clusters for each read and clusters of all the seeds in all reads
+ //The read clusters refer to seeds by their indexes in the input vectors of seeds
+ //The fragment clusters give seeds the index they would get if the vectors of
+ // seeds were appended to each other in the order given
+ tuple<vector<vector<vector<size_t>>>, vector<vector<size_t>>> cluster_seeds (
+ vector<vector<pos_t>> all_seeds,
 int64_t read_distance_limit, int64_t fragment_distance_limit=0) const;
+
 private:
 MinimumDistanceIndex& dist_index;
@@ -117,8 +126,11 @@ class SnarlSeedClusterer {
 //As clustering occurs at the current level, the parent level
 //is updated to know about its children

- //Vector of all the seeds
- vector<pos_t>* seeds;
+ //Vector of all the seeds for each read
+ vector<vector<pos_t>>* all_seeds;
+
+ //Vector of the offset of indices for each seed
+ vector<size_t> seed_index_offsets;

 //The minimum distance between nodes for them to be put in the
 //same cluster
@@ -129,7 +141,7 @@ class SnarlSeedClusterer {
 //////////Data structures to hold clustering information

 //Structure to hold the clustering of the seeds
- structures::UnionFind read_union_find;
+ vector<structures::UnionFind> read_union_find;
 structures::UnionFind fragment_union_find;

 //For each seed, store the distances to the left and right ends
@@ -146,7 +158,7 @@ class SnarlSeedClusterer {
 //Maps each node to a vector of the seeds that are contained in it
 //seeds are represented by indexes into the seeds vector
 //The array is sorted.
- vector<pair<id_t, size_t>> node_to_seeds;
+ vector<tuple<id_t, size_t, size_t>> node_to_seeds;

 //Map from snarl (index into dist_index.snarl_indexes) i
 //to the netgraph nodes contained in the snarl as well as the
@@ -172,14 +184,19 @@ class SnarlSeedClusterer {
 parent_snarl_to_nodes;

 //Constructor takes in a pointer to the seeds and the distance limit
- TreeState (vector<pos_t>* seeds, int64_t read_distance_limit,
+ TreeState (vector<vector<pos_t>>* all_seeds, int64_t read_distance_limit,
 int64_t fragment_distance_limit) :
- seeds(seeds),
+ all_seeds(all_seeds),
 read_cluster_dists(seeds->size(), make_pair(-1, -1)),
 read_union_find (seeds->size(), false),
 fragment_union_find (seeds->size(), false),
 read_distance_limit(read_distance_limit),
 fragment_distance_limit(fragment_distance_limit){
+ seed_index_offsets.push_back(0);
+ for (auto& v : *all_seeds) {
+ size_t offset = seed_index_offsets.back() + v.size();
+ seed_index_offsets.push_back(offset);
+ }
 }
 };

From 0b9f3dd5bb98db4959cabc770422560f5e57de1f Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 28 Oct 2019 15:51:32 -0400
Subject: [PATCH 05/79] Option for coverage threshold in augment

---
 src/augment.cpp | 158 ++++++++++++++++++++++++--------
 src/augment.hpp | 33 ++++++-
 src/packer.cpp | 18 +++-
 src/packer.hpp | 11 ++-
 src/subcommand/augment_main.cpp | 61 ++++++++----
 src/subcommand/pack_main.cpp | 9 +-
 6 files changed, 224 insertions(+), 66 deletions(-)

diff --git a/src/augment.cpp b/src/augment.cpp
index bf725f87cb9..e38b2b2556b 100644
--- a/src/augment.cpp
+++ b/src/augment.cpp
@@ -3,6 +3,7 @@

 #include "augment.hpp"
 #include "alignment.hpp"
+#include "packer.hpp"

 //#define debug

@@ -18,15 +19,21 @@ void augment(MutablePathMutableHandleGraph* graph,
 bool embed_paths,
 bool break_at_ends,
 bool remove_softclips,
- bool filter_out_of_graph_alignments) {
+ bool filter_out_of_graph_alignments,
+ Packer* packer,
+ size_t min_bp_coverage) {

- function<void(function<void(Alignment&)>, bool)> iterate_gam =
- [&gam_stream] (function<void(Alignment&)> aln_callback, bool reset_stream) {
+ function<void(function<void(Alignment&)>, bool, bool)> iterate_gam =
+ [&gam_stream] (function<void(Alignment&)> aln_callback, bool reset_stream, bool parallel) {
 if (reset_stream) {
 gam_stream.clear();
 gam_stream.seekg(0, ios_base::beg);
 }
- vg::io::for_each(gam_stream, aln_callback);
+ if (parallel) {
+ vg::io::for_each_parallel(gam_stream, aln_callback, Packer::estimate_batch_size(get_thread_count()));
+ } else {
+ vg::io::for_each(gam_stream, aln_callback);
+ }
 };

 augment_impl(graph,
@@ -36,7 +43,9 @@ void augment(MutablePathMutableHandleGraph* graph,
 embed_paths,
 break_at_ends,
 remove_softclips,
- filter_out_of_graph_alignments);
+ filter_out_of_graph_alignments,
+ packer,
+ min_bp_coverage);
 }

 void augment(MutablePathMutableHandleGraph* graph,
@@ -46,15 +55,30 @@ void augment(MutablePathMutableHandleGraph* graph,
 bool embed_paths,
 bool break_at_ends,
 bool remove_softclips,
- bool filter_out_of_graph_alignments) {
+ bool filter_out_of_graph_alignments,
+ Packer* packer,
+ size_t min_bp_coverage) {

- function<void(function<void(Alignment&)>, bool)> iterate_gam =
- [&path_vector] (function<void(Alignment&)> aln_callback, bool reset_stream) {
- for (Path& path : path_vector) {
- Alignment aln;
- *aln.mutable_path() = path;
- aln.set_name(path.name());
- aln_callback(aln);
+ function<void(function<void(Alignment&)>, bool, bool)> iterate_gam =
+ [&path_vector] (function<void(Alignment&)> aln_callback, bool reset_stream, bool parallel) {
+ if (parallel) {
+#pragma omp parallel for
+ for (size_t i = 0; i < path_vector.size(); ++i) {
+ Path& path = path_vector[i];
+ Alignment aln;
+ *aln.mutable_path() = path;
+ aln.set_name(path.name());
+ aln_callback(aln);
+ }
+
+ }
+ else {
+ for (Path& path : path_vector) {
+ Alignment aln;
+ *aln.mutable_path() = path;
+ aln.set_name(path.name());
+ aln_callback(aln);
+ }
 }
 };

 augment_impl(graph,
@@ -65,20 +89,28 @@ void augment(MutablePathMutableHandleGraph* graph,
 embed_paths,
 break_at_ends,
 remove_softclips,
- filter_out_of_graph_alignments);
+ filter_out_of_graph_alignments,
+ packer,
+ min_bp_coverage);
 }

 void augment_impl(MutablePathMutableHandleGraph* graph,
- function<void(function<void(Alignment&)>, bool)> iterate_gam,
+ function<void(function<void(Alignment&)>, bool, bool)> iterate_gam,
 vector<Translation>* out_translations,
 ostream* gam_out_stream,
 bool embed_paths,
 bool break_at_ends,
 bool remove_softclips,
- bool filter_out_of_graph_alignments) {
- // Collect the breakpoints
- unordered_map<id_t, set<pos_t>> breakpoints;
+ bool filter_out_of_graph_alignments,
+ Packer* packer,
+ size_t min_bp_coverage) {
+ // toggle between using Packer to store breakpoints or the STL map
+ bool packed_mode = min_bp_coverage > 0;
+ assert(!packed_mode || packer != nullptr);
+
+ unordered_map<id_t, set<pos_t>> breakpoints;
+
 // Check if alignment contains node that's not in the graph
 function<bool(const Path&)> check_in_graph = [&graph](const Path& path) {
 for (size_t i = 0; i < path.mapping_size(); ++i) {
@@ -106,14 +138,21 @@ void augment_impl(MutablePathMutableHandleGraph* graph,
 // Mapping (because we don't have or want a breakpoint there)
 Path simplified_path = simplify(aln.path());

- // Add in breakpoints from each path
- find_breakpoints(simplified_path, breakpoints, break_at_ends);
-
- }, false);
-
- // Invert the breakpoints that are on the reverse strand
- breakpoints = forwardize_breakpoints(graph, breakpoints);
+ if (packed_mode) {
+ find_packed_breakpoints(simplified_path, *packer, break_at_ends);
+ } else {
+ find_breakpoints(simplified_path, breakpoints, break_at_ends);
+ }
+ }, false, packed_mode);
+
+ if (packed_mode) {
+ // Filter the breakpoints by coverage
+ unordered_map<id_t, set<pos_t>> breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage);
+ } else {
+ // Invert the breakpoints that are on
the reverse strand + breakpoints = forwardize_breakpoints(graph, breakpoints); + } // get the node sizes, for use when making the translation unordered_map orig_node_sizes; @@ -187,7 +226,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, gam_buffer.push_back(aln); vg::io::write_buffered(*gam_out_stream, gam_buffer, 100); } - }, true); + }, true, false); if (gam_out_stream != nullptr) { // Flush the buffer vg::io::write_buffered(*gam_out_stream, gam_buffer, 0); @@ -235,7 +274,6 @@ void augment_impl(MutablePathMutableHandleGraph* graph, } - // returns breakpoints on the forward strand of the nodes void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends) { // We need to work out what offsets we will need to break each node at, if @@ -332,15 +370,6 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo } -path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) { - path_handle_t path_handle = graph->create_path_handle(path.name(), path.is_circular()); - for (int i = 0; i < path.mapping_size(); ++i) { - graph->append_step(path_handle, graph->get_handle(path.mapping(i).position().node_id(), - path.mapping(i).position().is_reverse())); - } - return path_handle; -} - unordered_map> forwardize_breakpoints(const HandleGraph* graph, const unordered_map>& breakpoints) { unordered_map> fwd; @@ -372,6 +401,63 @@ unordered_map> forwardize_breakpoints(const HandleGraph* graph, return fwd; } + +// returns breakpoints on the forward strand of the nodes +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends) { + // use existing methods to find the breakpoints, then copy them into a packer + // todo: streamline? + unordered_map> breakpoints; + find_breakpoints(path, breakpoints, break_ends); + breakpoints = forwardize_breakpoints(packed_breakpoints.get_graph(), breakpoints); + const HandleGraph* graph = packed_breakpoints.get_graph(); + for (auto& id_set : breakpoints) { + size_t node_len = graph->get_length(graph->get_handle(id_set.first)); + Position position; + position.set_node_id(id_set.first); + for (auto pos : id_set.second) { + size_t offset = get_offset(pos); + if (offset < node_len - 1) { + position.set_offset(offset); + packed_breakpoints.increment_coverage(packed_breakpoints.position_in_basis(position)); + } + } + } +} + +unordered_map> filter_breakpoints_by_coverage(const Packer& packed_breakpoints, size_t min_bp_coverage) { + vector>> bp_maps(get_thread_count()); + size_t n = packed_breakpoints.coverage_size(); + const VectorizableHandleGraph* vec_graph = dynamic_cast(packed_breakpoints.get_graph()); + // we assume our position vector is much larger than the number of filtered breakpoints + // and scan it in parallel in a first pass +#pragma omp parallel for + for (size_t i = 0; i < n; ++i) { + if (packed_breakpoints.coverage_at_position(i) >= min_bp_coverage) { + auto& bp_map = bp_maps[omp_get_thread_num()]; + nid_t node_id = vec_graph->node_at_vector_offset(i+1); + size_t offset = i - vec_graph->node_vector_offset(node_id); + bp_map[node_id].insert(make_pos_t(node_id, false, offset)); + } + } + // then collect up the breakpoints sequentially in a second pass + for (size_t i = 1; i < bp_maps.size(); ++i) { + for (auto& kv : bp_maps[i]) { + bp_maps[0][kv.first].insert(kv.second.begin(), kv.second.end()); + } + } + return bp_maps[0]; +} + + +path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) { + path_handle_t path_handle = 
graph->create_path_handle(path.name(), path.is_circular()); + for (int i = 0; i < path.mapping_size(); ++i) { + graph->append_step(path_handle, graph->get_handle(path.mapping(i).position().node_id(), + path.mapping(i).position().is_reverse())); + } + return path_handle; +} + map ensure_breakpoints(MutableHandleGraph* graph, const unordered_map>& breakpoints) { // Set up the map we will fill in with the new node start positions in the diff --git a/src/augment.hpp b/src/augment.hpp index ac42760ca79..3e818003f38 100644 --- a/src/augment.hpp +++ b/src/augment.hpp @@ -11,7 +11,9 @@ #include "handle.hpp" namespace vg { - + +class Packer; + using namespace std; /// %Edit the graph to include all the sequence and edges added by the given @@ -40,7 +42,9 @@ void augment(MutablePathMutableHandleGraph* graph, bool embed_paths = false, bool break_at_ends = false, bool remove_soft_clips = false, - bool filter_out_of_graph_alignments = false); + bool filter_out_of_graph_alignments = false, + Packer* packer = nullptr, + size_t min_bp_coverage = 0); /// Like above, but operates on a vector of Alignments, instead of a stream /// (Note: It is best to use stream interface for large numbers of alignments to save memory) @@ -51,23 +55,34 @@ void augment(MutablePathMutableHandleGraph* graph, bool embed_paths = false, bool break_at_ends = false, bool remove_soft_clips = false, - bool filter_out_of_graph_alignments = false); + bool filter_out_of_graph_alignments = false, + Packer* packer = nullptr, + size_t min_bp_coverage = 0); /// Generic version used to implement the above two methods. void augment_impl(MutablePathMutableHandleGraph* graph, - function, bool)> iterate_gam, + function, bool, bool)> iterate_gam, vector* out_translation, ostream* gam_out_stream, bool embed_paths, bool break_at_ends, bool remove_soft_clips, - bool filter_out_of_graph_alignments); + bool filter_out_of_graph_alignments, + Packer* packer, + size_t min_bp_coverage); /// Add a path to the graph. This is like VG::extend, and expects /// a path with no edits, and for all the nodes and edges in the path /// to exist exactly in the graph path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path); +/// Find all the points at which a Path enters or leaves nodes in the graph. Adds +/// them to the given map by node ID of sets of bases in the node that will need +/// to become the starts of new nodes. +/// +/// If break_ends is true, emits breakpoints at the ends of the path, even +/// if it starts/ends with perfect matches. + /// Find all the points at which a Path enters or leaves nodes in the graph. Adds /// them to the given map by node ID of sets of bases in the node that will need /// to become the starts of new nodes. 
@@ -80,6 +95,14 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo unordered_map> forwardize_breakpoints(const HandleGraph* graph, const unordered_map>& breakpoints); + +/// Like "find_breakpoints", but store in packed structure (better for large gams and enables coverage filter) +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true); + +/// Filters the breakpoints by coverage, and converts them back from the Packer to the STL map +/// expected by following methods +unordered_map> filter_breakpoints_by_coverage(const Packer& packed_breakpoints, size_t min_bp_coverage); + /// Take a map from node ID to a set of offsets at which new nodes should /// start (which may include 0 and 1-past-the-end, which should be ignored), /// break the specified nodes at those positions. Returns a map from old diff --git a/src/packer.cpp b/src/packer.cpp index 38100274611..ca81cfcc19e 100644 --- a/src/packer.cpp +++ b/src/packer.cpp @@ -9,6 +9,22 @@ namespace vg { const int Packer::maximum_quality = 60; const int Packer::lru_cache_size = 4096; +size_t Packer::estimate_data_width(size_t expected_coverage) { + return std::ceil(std::log2(2 * expected_coverage)); +} + +size_t Packer::estimate_batch_size(size_t num_threads) { + size_t batch_size = max((size_t)128, (size_t)(pow(2, 14 - log2(num_threads)))); + if (batch_size % 2 != 0) { + ++batch_size; + } + return batch_size; +} + +size_t Packer::estimate_bin_count(size_t num_threads) { + return pow(2, log2(num_threads) + 14); +} + Packer::Packer(void) : graph(nullptr), data_width(8), cov_bin_size(0), edge_cov_bin_size(0), num_bases_dynamic(0), base_locks(nullptr), num_edges_dynamic(0), edge_locks(nullptr), tmpfstream_locks(nullptr) { } Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins, size_t data_width, bool record_bases, bool record_edges, bool record_edits) : @@ -527,7 +543,7 @@ string Packer::unescape_delim(const string& s, char d) const { return unescaped; } -size_t Packer::coverage_size(void) { +size_t Packer::coverage_size(void) const { if (is_compacted){ return coverage_civ.size(); } diff --git a/src/packer.hpp b/src/packer.hpp index 5d7ad5e9910..dfece198320 100644 --- a/src/packer.hpp +++ b/src/packer.hpp @@ -30,6 +30,12 @@ using namespace sdsl; /// In memory, the coverages are stored in SDSL int vectors (dynamic) and on disk they are compressed int vectors class Packer { public: + + /// Some helper functions to heuristically estimate input parameters for constructor + static size_t estimate_data_width(size_t expected_coverage); + static size_t estimate_batch_size(size_t num_threads); + static size_t estimate_bin_count(size_t num_threads); + Packer(void); /// Create a Packer /// graph : Must implement the VectorizableHandleGraph interface @@ -75,7 +81,7 @@ class Packer { size_t get_n_bins(void) const; bool is_dynamic(void) const; const HandleGraph* get_graph() const; - size_t coverage_size(void); + size_t coverage_size(void) const ; void increment_coverage(size_t i); void increment_coverage(size_t i, size_t v); @@ -84,7 +90,8 @@ class Packer { size_t edge_vector_size(void) const; size_t edge_index(const Edge& e) const; void increment_edge_coverage(size_t i); - void increment_edge_coverage(size_t i, size_t v); + void increment_edge_coverage(size_t i, size_t v); + private: /// map from absolute postion to positions in the binned arrays pair coverage_bin_offset(size_t i) const; diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index 
9b1959295d3..abd0cdaa586 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -25,12 +25,14 @@ #include "../xg.hpp" #include "../vg.hpp" #include "../augment.hpp" +#include "../packer.hpp" #include #include #include #include "bdsg/packed_graph.hpp" #include "bdsg/hash_graph.hpp" #include "bdsg/odgi.hpp" +#include using namespace std; using namespace vg; @@ -47,6 +49,8 @@ void help_augment(char** argv, ConfigurableParser& parser) { << " -Z, --translation FILE save translations from augmented back to base graph to FILE" << endl << " -A, --alignment-out FILE save augmented GAM reads to FILE" << endl << " -s, --subgraph graph is a subgraph of the one used to create GAM. ignore alignments with missing nodes" << endl + << " -m, --min-coverage N minimum coverage of a breakpoint required for it to be added to the graph" << endl + << " -c, --expected-cov N expected coverage. used only for memory tuning [default : 128]" << endl << " -h, --help print this help message" << endl << " -p, --progress show progress" << endl << " -v, --verbose print information and warnings about vcf generation" << endl @@ -89,15 +93,19 @@ int main_augment(int argc, char** argv) { // fail when nodes are missing bool is_subgraph = false; + // Min coverage for graph to be broken at a breakpoint + // Whene non-zero, the Packer will be used to collect breakpoints + size_t min_coverage = 0; + + // Used to set data_width for Packer + size_t expected_coverage = 128; + // Print some progress messages to screen bool show_progress = false; // Print verbose message bool verbose = false; - // Number of threads to use (will default to all if not specified) - int thread_count = 0; - static const struct option long_options[] = { // Deprecated Options {"augmentation-mode", required_argument, 0, 'a'}, @@ -108,6 +116,8 @@ int main_augment(int argc, char** argv) { {"cut-softclips", no_argument, 0, 'C'}, {"label-paths", no_argument, 0, 'B'}, {"subgraph", no_argument, 0, 's'}, + {"min-coverage", required_argument, 0, 'm'}, + {"expected-cov", required_argument, 0, 'c'}, {"help", no_argument, 0, 'h'}, {"progress", required_argument, 0, 'p'}, {"verbose", no_argument, 0, 'v'}, @@ -117,7 +127,7 @@ int main_augment(int argc, char** argv) { {"include-gt", required_argument, 0, 'L'}, {0, 0, 0, 0} }; - static const char* short_options = "a:Z:A:iCBhpvt:l:L:s"; + static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -148,6 +158,12 @@ int main_augment(int argc, char** argv) { case 's': is_subgraph = true; break; + case 'm': + min_coverage = parse(optarg); + break; + case 'c': + expected_coverage = parse(optarg); + break; case 'h': case '?': /* getopt_long already printed an error message. */ @@ -159,12 +175,18 @@ int main_augment(int argc, char** argv) { break; case 'v': verbose = true; - break; + break; case 't': - thread_count = parse(optarg); + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; - - // Loci Options + } + // Loci Options case 'l': loci_file = optarg; break; @@ -181,12 +203,6 @@ int main_augment(int argc, char** argv) { // Parse the command line options, updating optind. 
parser.parse(argc, argv); - if (thread_count != 0) { - // Use a non-default number of threads - omp_set_num_threads(thread_count); - } - thread_count = get_thread_count(); - // Parse the two positional arguments if (optind + 1 > argc) { cerr << "[vg augment] error: too few arguments" << endl; @@ -227,6 +243,15 @@ int main_augment(int argc, char** argv) { graph = vg::io::VPKG::load_one(in); }); VG* vg_graph = dynamic_cast(graph.get()); + HandleGraph* vectorizable_graph = nullptr; + unique_ptr packer; + bdsg::VectorizableOverlayHelper overlay_helper; + if (min_coverage > 0) { + vectorizable_graph = dynamic_cast(overlay_helper.apply(graph.get())); + size_t data_width = Packer::estimate_data_width(expected_coverage); + size_t bin_count = Packer::estimate_bin_count(get_thread_count()); + packer = make_unique(vectorizable_graph, 0, bin_count, data_width, true, false, false); + } if (label_paths) { // Just add path names with extend() @@ -302,7 +327,9 @@ int main_augment(int argc, char** argv) { include_paths, include_paths, !include_softclips, - is_subgraph); + is_subgraph, + packer.get(), + min_coverage); } else { // much better to stream from a file so we can do two passes without storing in memory get_input_file(gam_in_file_name, [&](istream& alignment_stream) { @@ -313,7 +340,9 @@ int main_augment(int argc, char** argv) { include_paths, include_paths, !include_softclips, - is_subgraph); + is_subgraph, + packer.get(), + min_coverage); }); } diff --git a/src/subcommand/pack_main.cpp b/src/subcommand/pack_main.cpp index 79b8f5af8d6..fc75b273290 100644 --- a/src/subcommand/pack_main.cpp +++ b/src/subcommand/pack_main.cpp @@ -189,17 +189,14 @@ int main_pack(int argc, char** argv) { // get a data width from our expected coverage, using simple heuristic of counting // bits needed to store double the coverage - size_t data_width = std::ceil(std::log2(2 * expected_coverage)); + size_t data_width = Packer::estimate_data_width(expected_coverage); // use some naive heuristics to come up with bin count and batch size based on thread count // more bins: finer grained parallelism at cost of more mutexes and allocations // bigger batch size: more robustness to sorted input at cost of less parallelism size_t num_threads = get_thread_count(); - size_t batch_size = max((size_t)128, (size_t)(pow(2, 14 - log2(num_threads)))); - if (batch_size % 2 != 0) { - ++batch_size; - } - size_t bin_count = pow(2, log2(num_threads) + 14); + size_t batch_size = Packer::estimate_batch_size(num_threads); + size_t bin_count = Packer::estimate_bin_count(num_threads); // create our packer Packer packer(graph, bin_size, bin_count, data_width, true, true, record_edits); From 2696ec4a9cd9a3ea975b45357599ce2731fbbbd7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 29 Oct 2019 13:59:05 -0400 Subject: [PATCH 06/79] fixes and tests for augment coverages threshold --- src/augment.cpp | 140 ++++++++++++++++++++++++++++++++--------- src/augment.hpp | 6 ++ src/packer.cpp | 18 ++++-- src/packer.hpp | 3 +- test/t/17_vg_augment.t | 21 ++++++- 5 files changed, 153 insertions(+), 35 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index e38b2b2556b..468ab937d3b 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -148,12 +148,17 @@ void augment_impl(MutablePathMutableHandleGraph* graph, if (packed_mode) { // Filter the breakpoints by coverage - unordered_map> breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage); + breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage); } else { // Invert the 
breakpoints that are on the reverse strand breakpoints = forwardize_breakpoints(graph, breakpoints); } + // don't need this anymore: free up some memory + if (packer != nullptr) { + packer->clear(); + } + // get the node sizes, for use when making the translation unordered_map orig_node_sizes; orig_node_sizes.reserve(graph->get_node_count()); @@ -190,6 +195,12 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // the input paths in memory Path simplified_path = simplify(aln.path()); + // Filter out edits corresponding to breakpoints that didn't meet our coverage + // criteria + if (min_bp_coverage > 0) { + simplify_filtered_edits(graph, simplified_path, node_translation, orig_node_sizes); + } + // Now go through each new path again, by reference so we can overwrite. // Create new nodes/wire things up. Get the added version of the path. @@ -445,6 +456,7 @@ unordered_map> filter_breakpoints_by_coverage(const Packer& pac bp_maps[0][kv.first].insert(kv.second.begin(), kv.second.end()); } } + return bp_maps[0]; } @@ -551,6 +563,100 @@ map ensure_breakpoints(MutableHandleGraph* graph, return toReturn; } +// We use this function to get the id of the node that contains a position on an +// original node. +static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation) { + if(node_translation.find(make_pos_t(id(old_pos), false, 0)) == node_translation.end()) { + // The node is unchanged + return id(old_pos); + } + // Otherwise, get the first new node starting after that position, and + // then look left. + auto found = node_translation.upper_bound(old_pos); + assert(found != node_translation.end()); + if (id(found->first) != id(old_pos) + || is_rev(found->first) != is_rev(old_pos)) { + return id_t(0); + } + // Get the thing before that (last key <= the position we want + --found; + assert(graph->has_node(found->second)); + + // Return the node we found. + return found->second; +}; + + +void simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, + const unordered_map& orig_node_sizes) { + + // check if an edit position is chopped at its next or prev position + auto is_chopped = [&](pos_t edit_position, bool look_next) { + // are we adding to the offset? + bool forward = look_next != is_rev(edit_position); + bool chopped = true; + if (forward) { + // check if its chopped in the original graph + chopped = offset(edit_position) == orig_node_sizes.find(id(edit_position))->second - 1; + // check if its chopped in the translation + if (!chopped) { + auto edit_next_position = edit_position; + ++get_offset(edit_next_position); + chopped = find_new_node(graph, edit_position, node_translation) != find_new_node(graph, edit_next_position, node_translation); + } + } else { + // check if its chopped in the original graph + chopped = offset(edit_position) == 0; + // check if its chopped in the translation + if (!chopped) { + auto edit_prev_position = edit_position; + --get_offset(edit_prev_position); + chopped = find_new_node(graph, edit_position, node_translation) != find_new_node(graph, edit_prev_position, node_translation); + } + } + return chopped; + }; + + bool path_modified = false; + + for (size_t i = 0; i < path.mapping_size(); ++i) { + // For each Mapping in the path + Mapping& m = *path.mutable_mapping(i); + + // What node are we on? In old node ID space. + id_t node_id = m.position().node_id(); + + // See where the next edit starts in the node. 
It is always included + // (even when the edit runs backward), unless the edit has 0 length in + // the reference. + pos_t edit_first_position = make_pos_t(m.position()); + + for(size_t j = 0; j < m.edit_size(); ++j) { + // For each Edit in the mapping + Edit& e = *m.mutable_edit(j); + + // Work out where its end position on the original node is (inclusive) + // We don't use this on insertions, so 0-from-length edits don't matter. + pos_t edit_last_position = edit_first_position; + get_offset(edit_last_position) += (e.from_length()?e.from_length()-1:0); + + // skip edits whose breakpoitns weren't added due to the coverage filter + if (!edit_is_match(e) && (!is_chopped(edit_first_position, true) || !is_chopped(edit_last_position, false))) { + e.set_to_length(e.from_length()); + e.set_sequence(""); + path_modified = true; + } + + // Advance in the right direction along the original node for this edit. + // This way the next one will start at the right place. + get_offset(edit_first_position) += e.from_length(); + } + } + + if (path_modified) { + path = simplify(path); + } +} Path add_nodes_and_edges(MutableHandleGraph* graph, const Path& path, @@ -572,6 +678,7 @@ Path add_nodes_and_edges(MutableHandleGraph* graph, } + Path add_nodes_and_edges(MutableHandleGraph* graph, const Path& path, const map& node_translation, @@ -604,36 +711,13 @@ Path add_nodes_and_edges(MutableHandleGraph* graph, Path embedded; embedded.set_name(path.name()); - // We use this function to get the id of the node that contains a position on an - // original node. - auto find_new_node = [&](pos_t old_pos) { - if(node_translation.find(make_pos_t(id(old_pos), false, 0)) == node_translation.end()) { - // The node is unchanged - return id(old_pos); - } - // Otherwise, get the first new node starting after that position, and - // then look left. - auto found = node_translation.upper_bound(old_pos); - assert(found != node_translation.end()); - if (id(found->first) != id(old_pos) - || is_rev(found->first) != is_rev(old_pos)) { - return id_t(0); - } - // Get the thing before that (last key <= the position we want - --found; - assert(graph->has_node(found->second)); - - // Return the node we found. - return found->second; - }; - auto create_new_mappings = [&](pos_t p1, pos_t p2, bool is_rev) { vector mappings; vector nodes; for (pos_t p = p1; p <= p2; ++get_offset(p)) { - auto n = find_new_node(p); + auto n = find_new_node(graph, p, node_translation); assert(n != 0); - nodes.push_back(find_new_node(p)); + nodes.push_back(find_new_node(graph, p, node_translation)); } auto np = nodes.begin(); while (np != nodes.end()) { @@ -858,8 +942,8 @@ Path add_nodes_and_edges(MutableHandleGraph* graph, // have additional breakpoints in the middle. So we need the // left node, that contains the first base of the match, and the // right node, that contains the last base of the match. - id_t left_node = find_new_node(edit_first_position); - id_t right_node = find_new_node(edit_last_position); + id_t left_node = find_new_node(graph, edit_first_position, node_translation); + id_t right_node = find_new_node(graph, edit_last_position, node_translation); // TODO: we just assume the outer edges of these nodes are in // the right places. 
They should be if we cut the breakpoints diff --git a/src/augment.hpp b/src/augment.hpp index 3e818003f38..9fc8f5ac4f4 100644 --- a/src/augment.hpp +++ b/src/augment.hpp @@ -117,6 +117,12 @@ unordered_map> filter_breakpoints_by_coverage(const Packer& pac map ensure_breakpoints(MutableHandleGraph* graph, const unordered_map>& breakpoints); +/// Remove edits in our graph that don't correspond to breakpoints (ie were effectively filtered +/// out due to insufficient coverage. This way, subsequent logic in add_nodes_and_edges +/// can be run correctly +void simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, + const unordered_map& orig_node_sizes); + /// Given a path on nodes that may or may not exist, and a map from start /// position in the old graph to a node in the current graph, add all the /// new sequence and edges required by the path. The given path must not diff --git a/src/packer.cpp b/src/packer.cpp index ca81cfcc19e..6f14e79dd46 100644 --- a/src/packer.cpp +++ b/src/packer.cpp @@ -76,23 +76,33 @@ Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins, } } -Packer::~Packer(void) { - for (auto counter : coverage_dynamic) { +void Packer::clear() { + for (auto& counter : coverage_dynamic) { delete counter; + counter = nullptr; } - for (auto counter : edge_coverage_dynamic) { + for (auto& counter : edge_coverage_dynamic) { delete counter; + counter = nullptr; } delete [] base_locks; + base_locks = nullptr; delete [] edge_locks; + edge_locks = nullptr; delete [] tmpfstream_locks; + tmpfstream_locks = nullptr; close_edit_tmpfiles(); remove_edit_tmpfiles(); - for (auto lru_cache : quality_cache) { + for (auto& lru_cache : quality_cache) { delete lru_cache; + lru_cache = nullptr; } } +Packer::~Packer() { + clear(); +} + void Packer::load_from_file(const string& file_name) { ifstream in(file_name); if (!in) { diff --git a/src/packer.hpp b/src/packer.hpp index dfece198320..7a02bb34f9c 100644 --- a/src/packer.hpp +++ b/src/packer.hpp @@ -47,7 +47,8 @@ class Packer { /// record_edges : Store the edge coverage /// record_edits : Store the edits Packer(const HandleGraph* graph, size_t bin_size = 0, size_t coverage_bins = 1, size_t data_width = 8, bool record_bases = true, bool record_edges = true, bool record_edits = true); - ~Packer(void); + ~Packer(); + void clear(); /// Add coverage from given alignment to the indexes /// aln : given alignemnt diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index e252db9f274..0510fe2026f 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 15 +plan tests 18 vg view -J -v pileup/tiny.json > tiny.vg @@ -18,7 +18,11 @@ vg augment -a direct tiny.vg edits.gam -A edits-embedded.gam > augmented.vg is "$(vg view -aj edits-embedded.gam | jq -c '.path.mapping[].edit[].sequence' | grep null | wc -l)" "36" "direct augmentation embeds reads fully for well-supported SNPs" is "$(vg stats -N augmented.vg)" "18" "adding a well-supported SNP by direct augmentation adds 3 more nodes" -rm -f edits.gam edits-embedded.gam augmented.vg +# Run again but with packed logic. 
output should be identical with min threshold of 1 +vg augment -a direct tiny.vg edits.gam -A edits-embedded.gam -m 1 > augmented.m1.vg +is "$(vg stats -N augmented.m1.vg)" "18" "adding a well-supported SNP by direct augmentation adds 3 more nodes with -m 1" + +rm -f edits.gam edits-embedded.gam augmented.vg augmented.m1.vg # Make sure every edit is augmented in vg view -J -a -G pileup/edit.json > edit.gam @@ -74,6 +78,18 @@ vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam is $(vg augment flat.vg 2snp.gam -i | vg mod -D - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" +vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg +vg sim -l 30 -x 2err.vg -n 10 -a >2err.sim +vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam +cat 2snp.gam 2err.gam > 4edits.gam +vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes +vg augment flat.vg 2snp.gam -m 1 | vg view - | grep S | awk '{print $3}' | sort > 2snp_m1.nodes +diff 2snp_default.nodes 2snp_m1.nodes +is "$?" 0 "augmenting 2 snps with -m 1 produces the same nodes as default" +vg augment flat.vg 4edits.gam -m 11 | vg view - | grep S | awk '{print $3}' | sort > 4edits_m11.nodes +diff 2snp_default.nodes 4edits_m11.nodes +is "$?" 0 "augmenting 2 snps and 2 errors with -m 11 produces the same nodes as with just the snps" + vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > vg_augment.nodes vg convert flat.vg -p > flat.pg vg augment flat.pg 2snp.gam | vg convert -v - | vg view - | grep S | awk '{print $3}' | sort > packed_graph_augment.nodes @@ -85,3 +101,4 @@ diff vg_augment.nodes hash_graph_augment.nodes is "$?" 
0 "augmenting a hash graph produces same results as a vg graph" rm -f flat.vg flat.gcsa flat.xg flat.pg flat.hg 2snp.vg 2snp.xg 2snp.sim 2snp.gam vg_augment.nodes packed_graph_augment.nodes hash_graph_augment.nodes +rm -f 2err.sim 2err.gam 4edits.gam 2snp_default.nodes 2snp_m1.nodes 4edits_m11.nodes From c3de54c22f90e1723b7b2b2cc65a27a706494719 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 29 Oct 2019 14:39:12 -0400 Subject: [PATCH 07/79] dont process trivial paths --- src/augment.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index 468ab937d3b..b17eb3212e7 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -617,7 +617,8 @@ void simplify_filtered_edits(HandleGraph* graph, Path& path, const map Date: Tue, 29 Oct 2019 16:51:15 -0400 Subject: [PATCH 08/79] tests check that edits below coverage disappear in -A and -i output --- src/augment.cpp | 71 ++++++++++++++++++++++-------------------- src/augment.hpp | 4 +-- test/t/17_vg_augment.t | 10 ++++-- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index b17eb3212e7..771d31e4784 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -197,45 +197,49 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Filter out edits corresponding to breakpoints that didn't meet our coverage // criteria + bool has_edits = true; if (min_bp_coverage > 0) { - simplify_filtered_edits(graph, simplified_path, node_translation, orig_node_sizes); + has_edits = simplify_filtered_edits(graph, simplified_path, node_translation, orig_node_sizes); } // Now go through each new path again, by reference so we can overwrite. + // but only if we have a reason to + if (has_edits || gam_out_stream != nullptr || embed_paths) { - // Create new nodes/wire things up. Get the added version of the path. - Path added = add_nodes_and_edges(graph, simplified_path, node_translation, added_seqs, - added_nodes, orig_node_sizes); + // Create new nodes/wire things up. Get the added version of the path. + Path added = add_nodes_and_edges(graph, simplified_path, node_translation, added_seqs, + added_nodes, orig_node_sizes); - // Copy over the name - *added.mutable_name() = aln.name(); + // Copy over the name + *added.mutable_name() = aln.name(); - if (embed_paths) { - add_path_to_graph(graph, added); - } + if (embed_paths) { + add_path_to_graph(graph, added); + } - // something is off about this check. - // assuming the GAM path is sorted, let's double-check that its edges are here - for (size_t i = 1; i < added.mapping_size(); ++i) { - auto& m1 = added.mapping(i-1); - auto& m2 = added.mapping(i); - // we're no longer sorting our input paths, so we assume they are sorted - assert((m1.rank() == 0 && m2.rank() == 0) || (m1.rank() + 1 == m2.rank())); - //if (!adjacent_mappings(m1, m2)) continue; // the path is completely represented here - auto s1 = graph->get_handle(m1.position().node_id(), m1.position().is_reverse()); - auto s2 = graph->get_handle(m2.position().node_id(), m2.position().is_reverse()); - // check that we always have an edge between the two nodes in the correct direction - if (!graph->has_edge(s1, s2)) { - // force these edges in - graph->create_edge(s1, s2); + // something is off about this check. 
+ // assuming the GAM path is sorted, let's double-check that its edges are here + for (size_t i = 1; i < added.mapping_size(); ++i) { + auto& m1 = added.mapping(i-1); + auto& m2 = added.mapping(i); + // we're no longer sorting our input paths, so we assume they are sorted + assert((m1.rank() == 0 && m2.rank() == 0) || (m1.rank() + 1 == m2.rank())); + //if (!adjacent_mappings(m1, m2)) continue; // the path is completely represented here + auto s1 = graph->get_handle(m1.position().node_id(), m1.position().is_reverse()); + auto s2 = graph->get_handle(m2.position().node_id(), m2.position().is_reverse()); + // check that we always have an edge between the two nodes in the correct direction + if (!graph->has_edge(s1, s2)) { + // force these edges in + graph->create_edge(s1, s2); + } } - } - // optionally write out the modified path to GAM - if (gam_out_stream != nullptr) { - *aln.mutable_path() = added; - gam_buffer.push_back(aln); - vg::io::write_buffered(*gam_out_stream, gam_buffer, 100); + // optionally write out the modified path to GAM + if (gam_out_stream != nullptr) { + *aln.mutable_path() = added; + gam_buffer.push_back(aln); + vg::io::write_buffered(*gam_out_stream, gam_buffer, 100); + } } }, true, false); if (gam_out_stream != nullptr) { @@ -587,7 +591,7 @@ static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation, +bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, const unordered_map& orig_node_sizes) { // check if an edit position is chopped at its next or prev position @@ -658,13 +662,12 @@ void simplify_filtered_edits(HandleGraph* graph, Path& path, const map ensure_breakpoints(MutableHandleGraph* graph, /// Remove edits in our graph that don't correspond to breakpoints (ie were effectively filtered /// out due to insufficient coverage. This way, subsequent logic in add_nodes_and_edges -/// can be run correctly -void simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, +/// can be run correctly. Returns true if at least one edit survived the filter. +bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, const unordered_map& orig_node_sizes); /// Given a path on nodes that may or may not exist, and a map from start diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index 0510fe2026f..109927fe822 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 18 +plan tests 21 vg view -J -v pileup/tiny.json > tiny.vg @@ -43,6 +43,12 @@ vg index -k 11 -g t.idx.gcsa -x t.idx.xg t.vg is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i | vg view - | grep ^S | wc -l) 1 "path inclusion does not modify the graph when alignment is a perfect match" +is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGGAGTTCTAATATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i -m 2 | vg view - | grep ^S | wc -l) 1 "path inclusion does not modify the graph when alignment has a SNP but doesnt meet the coverage threshold" + +is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGGAGTTCTAATATATTCCAACTCTCTG -V read -d t.idx | vg augment t.vg - -i -m 2 -A read_aug.gam | vg view - | grep ^P | awk '{print $4}' | uniq) "50M" "path inclusion does not modify the included path when alignment has a SNP but doesnt meet the coverage threshold" + +is $(vg view -a read_aug.gam | jq. 
| grep edit | wc) 1 "output GAM has single edit when SNP was filtered out due to coverage" + is $(vg map -s CAAATAAGGCTTGGAAAGGGTTTCTGGAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i | vg view - | grep ^S | wc -l) 5 "path inclusion with a complex variant introduces the right number of nodes" # checks that we get a node with the id 4, which is the ref-matching dual to the deletion @@ -51,7 +57,7 @@ is $(vg map -s CAAAAAGGCTTGGAAAGGGTTTCTGGAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGCAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i | vg view - | grep ^S | wc -l) 4 "SNPs can be included in the graph" rm t.vg -rm -rf t.idx.xg t.idx.gcsa +rm -rf t.idx.xg t.idx.gcsa read_aug.gam vg construct -v tiny/tiny.vcf.gz -r tiny/tiny.fa >t.vg vg align -s GGGGGGGAAATTTTCTGGAGTTCTATTATATTCCAAAAAAAAAA t.vg >t.gam From 66d4f4895d45c16ffe355de55fc026402d706f20 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 30 Oct 2019 14:00:05 -0400 Subject: [PATCH 09/79] quality filters for augment --- src/augment.cpp | 106 ++++++++++++++++++++++---------- src/augment.hpp | 22 ++++++- src/subcommand/augment_main.cpp | 32 ++++++++-- test/t/17_vg_augment.t | 20 +++++- 4 files changed, 138 insertions(+), 42 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index 771d31e4784..dda25626556 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -20,6 +20,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_softclips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage) { @@ -44,6 +46,8 @@ void augment(MutablePathMutableHandleGraph* graph, break_at_ends, remove_softclips, filter_out_of_graph_alignments, + min_baseq, + min_mapq, packer, min_bp_coverage); } @@ -56,6 +60,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_softclips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage) { @@ -90,6 +96,8 @@ void augment(MutablePathMutableHandleGraph* graph, break_at_ends, remove_softclips, filter_out_of_graph_alignments, + min_baseq, + min_mapq, packer, min_bp_coverage); } @@ -102,11 +110,13 @@ void augment_impl(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_softclips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage) { // toggle between using Packer to store breakpoints or the STL map - bool packed_mode = min_bp_coverage > 0; + bool packed_mode = min_bp_coverage > 0 || min_baseq > 0; assert(!packed_mode || packer != nullptr); unordered_map> breakpoints; @@ -126,7 +136,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, #ifdef debug cerr << pb2json(aln.path()) << endl; #endif - if (filter_out_of_graph_alignments && !check_in_graph(aln.path())) { + if (aln.mapping_quality() < min_mapq || (filter_out_of_graph_alignments && !check_in_graph(aln.path()))) { return; } @@ -140,9 +150,11 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Add in breakpoints from each path if (packed_mode) { - find_packed_breakpoints(simplified_path, *packer, break_at_ends); + find_packed_breakpoints(simplified_path, *packer, break_at_ends, aln.quality(), min_baseq); } else { - find_breakpoints(simplified_path, breakpoints, break_at_ends); + // note: we cannot pass non-zero min_baseq here. 
it relies on filter_breakpoints_by_coverage + // to work correctly, and must be passed in only via find_packed_breakpoints. + find_breakpoints(simplified_path, breakpoints, break_at_ends, "", 0); } }, false, packed_mode); @@ -181,7 +193,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Second pass: add the nodes and edges iterate_gam((function)[&](Alignment& aln) { - if (filter_out_of_graph_alignments && !check_in_graph(aln.path())) { + if (aln.mapping_quality() < min_mapq || (filter_out_of_graph_alignments && !check_in_graph(aln.path()))) { return; } @@ -289,8 +301,21 @@ void augment_impl(MutablePathMutableHandleGraph* graph, } +double get_avg_baseq(const Edit& edit, const string& base_quals, size_t position_in_read) { + double avg_qual = numeric_limits::max(); + if (!base_quals.empty() && !edit.sequence().empty() && (edit_is_sub(edit) || edit_is_insertion(edit))) { + double tot_qual = 0; + for (int i = 0; i < edit.sequence().length(); ++i) { + tot_qual += base_quals[position_in_read + i]; + } + avg_qual = tot_qual / (double)edit.sequence().length(); + } + return avg_qual; +} + // returns breakpoints on the forward strand of the nodes -void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends) { +void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends, + const string& base_quals, double min_baseq) { // We need to work out what offsets we will need to break each node at, if // we want to add in all the new material and edges in this path. @@ -298,6 +323,9 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo cerr << "Processing path..." << endl; #endif + // The base position in the edit + size_t position_in_read = 0; + for (size_t i = 0; i < path.mapping_size(); ++i) { // For each Mapping in the path const Mapping& m = path.mapping(i); @@ -338,38 +366,41 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo cerr << pb2json(e) << endl; #endif - if (!edit_is_match(e) || (j == 0 && (i != 0 || break_ends))) { - // If this edit is not a perfect match, or if this is the first - // edit in this mapping and either we had a previous mapping we - // may need to connect to or we want to break at the path's - // start, we need to make sure we have a breakpoint at the start - // of this edit. + // Do the base quality check if applicable. If it fails we just ignore the edit + if (min_baseq == 0 || get_avg_baseq(e, base_quals, position_in_read) >= min_baseq) { + + if (!edit_is_match(e) || (j == 0 && (i != 0 || break_ends))) { + // If this edit is not a perfect match, or if this is the first + // edit in this mapping and either we had a previous mapping we + // may need to connect to or we want to break at the path's + // start, we need to make sure we have a breakpoint at the start + // of this edit. #ifdef debug - cerr << "Need to break " << node_id << " at edit lower end " << - edit_first_position << endl; + cerr << "Need to break " << node_id << " at edit lower end " << + edit_first_position << endl; #endif - // We need to snip between edit_first_position and edit_first_position - direction. - // Note that it doesn't matter if we put breakpoints at 0 and 1-past-the-end; those will be ignored. - breakpoints[node_id].insert(edit_first_position); - } + // We need to snip between edit_first_position and edit_first_position - direction. + // Note that it doesn't matter if we put breakpoints at 0 and 1-past-the-end; those will be ignored. 
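+ // (Worked example of the quality gate above: with min_baseq 10, a 2 bp
+ // substitution whose base qualities are 5 and 9 averages 7, fails the
+ // check, and records no breakpoints at all.)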
+ breakpoints[node_id].insert(edit_first_position); + } - if (!edit_is_match(e) || (j == m.edit_size() - 1 && (i != path.mapping_size() - 1 || break_ends))) { - // If this edit is not a perfect match, or if it is the last - // edit in a mapping and we have a subsequent mapping we might - // need to connect to or we want to break at the path ends, make - // sure we have a breakpoint at the end of this edit. + if (!edit_is_match(e) || (j == m.edit_size() - 1 && (i != path.mapping_size() - 1 || break_ends))) { + // If this edit is not a perfect match, or if it is the last + // edit in a mapping and we have a subsequent mapping we might + // need to connect to or we want to break at the path ends, make + // sure we have a breakpoint at the end of this edit. #ifdef debug - cerr << "Need to break " << node_id << " at past edit upper end " << - edit_last_position << endl; + cerr << "Need to break " << node_id << " at past edit upper end " << + edit_last_position << endl; #endif - // We also need to snip between edit_last_position and edit_last_position + direction. - breakpoints[node_id].insert(edit_last_position); + // We also need to snip between edit_last_position and edit_last_position + direction. + breakpoints[node_id].insert(edit_last_position); + } } - // TODO: for an insertion or substitution, note that we need a new // node and two new edges. @@ -380,6 +411,8 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo // Use up the portion of the node taken by this mapping, so we know // where the next mapping will start. edit_first_position = edit_last_position; + + position_in_read += e.to_length(); } } @@ -418,11 +451,12 @@ unordered_map> forwardize_breakpoints(const HandleGraph* graph, // returns breakpoints on the forward strand of the nodes -void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends) { +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends, + const string& base_quals, double min_baseq) { // use existing methods to find the breakpoints, then copy them into a packer // todo: streamline? 
unordered_map> breakpoints; - find_breakpoints(path, breakpoints, break_ends); + find_breakpoints(path, breakpoints, break_ends, base_quals, min_baseq); breakpoints = forwardize_breakpoints(packed_breakpoints.get_graph(), breakpoints); const HandleGraph* graph = packed_breakpoints.get_graph(); for (auto& id_set : breakpoints) { @@ -592,7 +626,8 @@ static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation, - const unordered_map& orig_node_sizes) { + const unordered_map& orig_node_sizes, + const string& base_quals, double min_baseq) { // check if an edit position is chopped at its next or prev position auto is_chopped = [&](pos_t edit_position, bool look_next) { @@ -624,6 +659,9 @@ bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map 0 && get_avg_baseq(e, base_quals, position_in_read) < min_baseq)) { e.set_to_length(e.from_length()); e.set_sequence(""); filtered_an_edit = true; @@ -659,6 +699,8 @@ bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map* out_translation = nullptr, @@ -43,6 +45,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends = false, bool remove_soft_clips = false, bool filter_out_of_graph_alignments = false, + double min_baseq = 0, + double min_mapq = 0, Packer* packer = nullptr, size_t min_bp_coverage = 0); @@ -56,6 +60,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends = false, bool remove_soft_clips = false, bool filter_out_of_graph_alignments = false, + double min_baseq = 0, + double min_mapq = 0, Packer* packer = nullptr, size_t min_bp_coverage = 0); @@ -68,6 +74,8 @@ void augment_impl(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_soft_clips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage); @@ -76,6 +84,11 @@ void augment_impl(MutablePathMutableHandleGraph* graph, /// to exist exactly in the graph path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path); +/// Compute the average base quality of an edit. +/// If the edit has no sequence or there are no base_quals given, +/// then double_max is returned. +double get_avg_baseq(const Edit& edit, const string& base_quals, size_t position_in_read); + /// Find all the points at which a Path enters or leaves nodes in the graph. Adds /// them to the given map by node ID of sets of bases in the node that will need /// to become the starts of new nodes. @@ -89,7 +102,8 @@ path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) /// /// If break_ends is true, emits breakpoints at the ends of the path, even /// if it starts/ends with perfect matches. -void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends = true); +void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends = true, + const string& base_quals = "", double min_baseq = 0); /// Flips the breakpoints onto the forward strand. 
unordered_map> forwardize_breakpoints(const HandleGraph* graph, @@ -97,7 +111,8 @@ unordered_map> forwardize_breakpoints(const HandleGraph* graph, /// Like "find_breakpoints", but store in packed structure (better for large gams and enables coverage filter) -void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true); +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true, + const string& base_quals = "", double min_baseq = 0); /// Filters the breakpoints by coverage, and converts them back from the Packer to the STL map /// expected by following methods @@ -121,7 +136,8 @@ map ensure_breakpoints(MutableHandleGraph* graph, /// out due to insufficient coverage. This way, subsequent logic in add_nodes_and_edges /// can be run correctly. Returns true if at least one edit survived the filter. bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, - const unordered_map& orig_node_sizes); + const unordered_map& orig_node_sizes, + const string& base_quals = "", double min_baseq = 0); /// Given a path on nodes that may or may not exist, and a map from start /// position in the old graph to a node in the current graph, add all the diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index abd0cdaa586..994a65c4050 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -51,10 +51,12 @@ void help_augment(char** argv, ConfigurableParser& parser) { << " -s, --subgraph graph is a subgraph of the one used to create GAM. ignore alignments with missing nodes" << endl << " -m, --min-coverage N minimum coverage of a breakpoint required for it to be added to the graph" << endl << " -c, --expected-cov N expected coverage. 
used only for memory tuning [default : 128]" << endl + << " -q, --min-baseq N ignore edits whose sequence have average base quality < N" << endl + << " -Q, --min-mapq N ignore alignments with mapping quality < N" << endl << " -h, --help print this help message" << endl << " -p, --progress show progress" << endl << " -v, --verbose print information and warnings about vcf generation" << endl - << " -t, --threads N number of threads to use" << endl + << " -t, --threads N number of threads (only 1st pass with -m or -q option is multithreaded)" << endl << "loci file options:" << endl << " -l, --include-loci FILE merge all alleles in loci into the graph" << endl << " -L, --include-gt FILE merge only the alleles in called genotypes into the graph" << endl; @@ -100,6 +102,12 @@ int main_augment(int argc, char** argv) { // Used to set data_width for Packer size_t expected_coverage = 128; + // Minimum average base quality in an edit's sequence for it to be used + double min_baseq = 0; + + // Minimum mapping quality of an alignment for it to be used + double min_mapq = 0; + // Print some progress messages to screen bool show_progress = false; @@ -117,7 +125,9 @@ int main_augment(int argc, char** argv) { {"label-paths", no_argument, 0, 'B'}, {"subgraph", no_argument, 0, 's'}, {"min-coverage", required_argument, 0, 'm'}, - {"expected-cov", required_argument, 0, 'c'}, + {"expected-cov", required_argument, 0, 'c'}, + {"min-baseq", required_argument, 0, 'q'}, + {"min-mapq", required_argument, 0, 'Q'}, {"help", no_argument, 0, 'h'}, {"progress", required_argument, 0, 'p'}, {"verbose", no_argument, 0, 'v'}, @@ -127,7 +137,7 @@ int main_augment(int argc, char** argv) { {"include-gt", required_argument, 0, 'L'}, {0, 0, 0, 0} }; - static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:"; + static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:q:Q:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -163,7 +173,13 @@ int main_augment(int argc, char** argv) { break; case 'c': expected_coverage = parse(optarg); - break; + break; + case 'q': + min_baseq = parse(optarg); + break; + case 'Q': + min_mapq = parse(optarg); + break; case 'h': case '?': /* getopt_long already printed an error message. */ @@ -246,7 +262,9 @@ int main_augment(int argc, char** argv) { HandleGraph* vectorizable_graph = nullptr; unique_ptr packer; bdsg::VectorizableOverlayHelper overlay_helper; - if (min_coverage > 0) { + // the packer's required for any kind of filtering logic -- so we use it when + // baseq is present as well. 
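+    // (min_baseq is a double, so any value greater than 0 makes this condition true)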
+ if (min_coverage > 0 || min_baseq ) { vectorizable_graph = dynamic_cast(overlay_helper.apply(graph.get())); size_t data_width = Packer::estimate_data_width(expected_coverage); size_t bin_count = Packer::estimate_bin_count(get_thread_count()); @@ -328,6 +346,8 @@ int main_augment(int argc, char** argv) { include_paths, !include_softclips, is_subgraph, + min_baseq, + min_mapq, packer.get(), min_coverage); } else { @@ -341,6 +361,8 @@ int main_augment(int argc, char** argv) { include_paths, !include_softclips, is_subgraph, + min_baseq, + min_mapq, packer.get(), min_coverage); }); diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index 109927fe822..ed66c2dc910 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 21 +plan tests 23 vg view -J -v pileup/tiny.json > tiny.vg @@ -96,6 +96,22 @@ vg augment flat.vg 4edits.gam -m 11 | vg view - | grep S | awk '{print $3}' | so diff 2snp_default.nodes 4edits_m11.nodes is "$?" 0 "augmenting 2 snps and 2 errors with -m 11 produces the same nodes as with just the snps" +# 2 snps, but one has a low quality, and one has a high quality +echo "@read" > qual.fq +echo "CAAATAAGGCTTGGAAATTGTCTGGAGTTCTATTATATGCCAACTCTCTG" >> qual.fq +echo "+" >> qual.fq +echo "BBBBBBBBBBBBBBBBBBB+BBBBBBBBBBBBBBBBBBKBBBBBBBBBBB" >> qual.fq +# reverse complement +echo "@daer" >> qual.fq +echo "CAGAGAGTTGGCATATAATAGAACTCCAGACAATTTCCAAGCCTTATTTG" >> qual.fq +echo "+" >> qual.fq +echo "BBBBBBBBBBBKBBBBBBBBBBBBBBBBBB+BBBBBBBBBBBBBBBBBBB" >> qual.fq +vg map -g flat.gcsa -x flat.xg -f qual.fq -k 8 > 2qual.gam +# sanity check: +is $(vg augment flat.vg 2qual.gam -m 2 | vg view - | grep ^S | wc -l) 7 "augmenting with 2snps makes correct number of nodes" +# test quality filter +is $(vg augment flat.vg 2qual.gam -m 2 -q 30 | vg view - | grep ^S | wc -l) 4 "low-quality snp is filtered" + vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > vg_augment.nodes vg convert flat.vg -p > flat.pg vg augment flat.pg 2snp.gam | vg convert -v - | vg view - | grep S | awk '{print $3}' | sort > packed_graph_augment.nodes @@ -107,4 +123,4 @@ diff vg_augment.nodes hash_graph_augment.nodes is "$?" 
0 "augmenting a hash graph produces same results as a vg graph" rm -f flat.vg flat.gcsa flat.xg flat.pg flat.hg 2snp.vg 2snp.xg 2snp.sim 2snp.gam vg_augment.nodes packed_graph_augment.nodes hash_graph_augment.nodes -rm -f 2err.sim 2err.gam 4edits.gam 2snp_default.nodes 2snp_m1.nodes 4edits_m11.nodes +rm -f 2err.sim 2err.gam 4edits.gam 2snp_default.nodes 2snp_m1.nodes 4edits_m11.nodes 2qual.gam qual.fq From 9c0bf18e7e4fedffa322e51e3c2c152cde0a9e4a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 30 Oct 2019 16:39:28 -0700 Subject: [PATCH 10/79] Let distance indexing take any handle graph, and document that -x can feed it an XG --- src/subcommand/index_main.cpp | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 054da2bb431..682c11f380a 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -41,7 +41,7 @@ void help_index(char** argv) { << " -t, --threads N number of threads to use" << endl << " -p, --progress show progress" << endl << "xg options:" << endl - << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA indexing" << endl + << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl << " -L, --xg-alts include alt paths in xg" << endl << "gbwt options:" << endl << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl @@ -1234,6 +1234,8 @@ int main_index(int argc, char** argv) { if (file_names.empty() && xg_name.empty()) { cerr << "error: [vg index] one graph is required to build a distance index" << endl; return 1; + } else if (file_names.size() > 1 || (file_names.size() == 1 && !xg_name.empty())) { + cerr << "error: [vg index] only one graph at a time can be used to build a distance index" << endl; } else if (dist_name.empty()) { cerr << "error: [vg index] distance index requires an output file" << endl; return 1; @@ -1253,31 +1255,25 @@ int main_index(int argc, char** argv) { //Get graph and build dist index if (file_names.empty() && !xg_name.empty()) { + // We were given a -x specifically to read as XG ifstream xg_stream(xg_name); auto xg = vg::io::VPKG::load_one(xg_stream); // Create the MinimumDistanceIndex - MinimumDistanceIndex di (xg.get(), snarl_manager); + MinimumDistanceIndex di(xg.get(), snarl_manager); // Save the completed DistanceIndex - ofstream ostream (dist_name); + ofstream ostream(dist_name); di.serialize(ostream); } else { - ifstream vg_stream(file_names.at(0)); - - if (!vg_stream) { - cerr << "error: [vg index] cannot open VG file" << endl; - exit(1); - } - - VG vg(vg_stream); - vg_stream.close(); + // We were given a graph generically + auto graph = vg::io::VPKG::load_one(file_names.at(0)); // Create the MinimumDistanceIndex - MinimumDistanceIndex di (&vg, snarl_manager); + MinimumDistanceIndex di(graph.get(), snarl_manager); // Save the completed DistanceIndex - ofstream ostream (dist_name); + ofstream ostream(dist_name); di.serialize(ostream); // vg::io::VPKG::save(di, dist_name); } From 2c060bce77ea30a0ffcc1e50b11bef3fa84442b9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 31 Oct 2019 09:57:32 -0400 Subject: [PATCH 11/79] update libvgio --- deps/libvgio | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libvgio b/deps/libvgio index 1248a9b6444..fe25b2be07c 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ 
-Subproject commit 1248a9b64440b91ba8431cad8f3aa72f734b5bef +Subproject commit fe25b2be07cbbcf98ed2380213b1fa064e04f68d From eb66165e0dc10fdf20eb01ef592c80b5525b1362 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 31 Oct 2019 16:42:46 -0400 Subject: [PATCH 12/79] start vg depth --- src/algorithms/coverage_depth.cpp | 207 ++++++++++++++++++++++++++++++ src/algorithms/coverage_depth.hpp | 48 +++++++ src/subcommand/depth_main.cpp | 200 +++++++++++++++++++++++++++++ 3 files changed, 455 insertions(+) create mode 100644 src/algorithms/coverage_depth.cpp create mode 100644 src/algorithms/coverage_depth.hpp create mode 100644 src/subcommand/depth_main.cpp diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp new file mode 100644 index 00000000000..e848161124f --- /dev/null +++ b/src/algorithms/coverage_depth.cpp @@ -0,0 +1,207 @@ +#include "coverage_depth.hpp" +#include +#include "algorithms/subgraph.hpp" +#include +#include "../path.hpp" + +namespace vg { +namespace algorithms { + +/// Estimate the depth of coverage of a given (sub) graph using the packer +/// Coverage is computed relative to the given path +double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path) { + + // get the path length + path_handle_t path_handle = graph.get_path_handle(ref_path); + size_t path_len = 0; + for (handle_t handle : graph.scan_path(path_handle)) { + path_len += graph.get_length(handle); + } + if (path_len == 0) { + return 0; + } + + // sum up the coverage + size_t tot_base_coverage = 0; + graph.for_each_handle([&] (handle_t handle) { + Position pos; + pos.set_node_id(graph.get_id(handle)); + size_t packer_pos = packer.position_in_basis(pos); + size_t node_len = graph.get_length(handle); + for (size_t offset = 0; offset < node_len; ++offset) { + tot_base_coverage += packer.coverage_at_position(packer_pos + offset); + } + }); + + // return average (over the path) + return (double)tot_base_coverage / (double)path_len; +} + + +/// Estimate the binned coverage along a path +map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, + size_t step, size_t context, size_t threads) { + + // move forward along path (note: this can be sped up if we're given a PathPositionHandleGraph but I don't think + // it matters for a couple of scans. 
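+    // advance() walks forward one step at a time until at least "distance" bases have been
+    // consumed (possibly overshooting by up to a node length) and returns the distance covered.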
+ function advance = [&] (step_handle_t& step_handle, size_t distance) { + size_t went = 0; + for (; graph.has_next_step(step_handle) && went < distance; step_handle = graph.get_next_step(step_handle)) { + went += graph.get_length(graph.get_handle_of_step(step_handle)); + } + return went; + }; + + path_handle_t path_handle = graph.get_path_handle(ref_path); + step_handle_t step_handle = graph.path_begin(path_handle); + + // hop along the graph, grabbing a step handle every "step" bases (or thereabouts) + vector> bin_centers; + size_t pos = advance(step_handle, step / 2); + if (pos >= step / 2) { + size_t went; + do { + if (bin_centers.empty() || step_handle != bin_centers.back().second) { + bin_centers.push_back(make_pair(pos, step_handle)); + } + went = advance(step_handle, step); + pos += went; + } while (went >= step); + } + + // our graph's too small to do any stepping, just use the first handle + if (bin_centers.empty()) { + bin_centers.push_back(make_pair(0, graph.path_begin(path_handle))); + } + + // visit every bin center and make a subgraph to collect coverage from + if (threads == 0) { + threads = get_thread_count(); + } + map binned_depths; +#pragma omp parallel for num_threads(threads) + for (size_t i = 0; i < bin_centers.size(); ++i) { + // extract the subgraph + bdsg::HashGraph subgraph; + step_handle_t bin_step = bin_centers[i].second; + handle_t bin_handle = graph.get_handle_of_step(bin_step); + assert(graph.get_is_reverse(bin_handle) == false); + subgraph.create_handle(graph.get_sequence(bin_handle), graph.get_id(bin_handle)); + expand_subgraph_by_steps(graph, subgraph, context); + + // sum up the coverage on the subgraph + size_t tot_base_coverage = 0; + size_t tot_ref_len = 0; + subgraph.for_each_handle([&] (handle_t sub_handle) { + // go back into the original graph because we don't have any + // path information in the subgraph because we are unable + // to get it without requiring the path position interface + handle_t orig_handle = graph.get_handle(subgraph.get_id(sub_handle)); + Position pos; + pos.set_node_id(graph.get_id(orig_handle)); + size_t packer_pos = packer.position_in_basis(pos); + size_t node_len = graph.get_length(orig_handle); + for (size_t offset = 0; offset < node_len; ++offset) { + tot_base_coverage += packer.coverage_at_position(packer_pos + offset); + } + // we manually test if each handle is on our reference path (again, to + // not require path position interface) + vector step_path_handles = graph.steps_of_handle(orig_handle); + bool on_ref = false; + for (size_t j = 0; j < step_path_handles.size() && !on_ref; ++j) { + on_ref = graph.get_path_handle_of_step(step_path_handles[j]) == path_handle; + } + if (on_ref) { + tot_ref_len += node_len; + } + }); + + assert(tot_ref_len > 0); + double avg_base_coverage = tot_base_coverage / tot_ref_len; + +#pragma omp critical (update_binned_depth) + binned_depths[bin_centers[i].first] = avg_base_coverage; + } + + return binned_depths; + +} + +// draw (roughly) max_nodes nodes from the graph using the random seed +static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { + default_random_engine generator(random_seed); + uniform_real_distribution distribution(0, 1); + double cutoff = std::min((double)1.0, (double)(max_nodes / graph.get_node_count())); + unordered_map sampled_nodes; + graph.for_each_handle([&](handle_t handle) { + if (cutoff == 1 || cutoff < distribution(generator)) { + sampled_nodes[graph.get_id(handle)] = 0; + } + }); + return sampled_nodes; +} + +// 
update the coverage from an alignment. only count nodes that are in the map already +static void update_sample_gam_depth(const Alignment& aln, unordered_map& node_coverage) { + const Path& path = aln.path(); + for (int i = 0; i < path.mapping_size(); ++i) { + const Mapping& mapping = path.mapping(i); + nid_t node_id = mapping.position().node_id(); + if (node_coverage.count(node_id)) { + ++node_coverage[node_id]; + } + } +} + +// sum up the results from the different threads and return the average. +// if a min_coverage is given, nodes with less coverage are ignored +static double combine_and_average_node_coverages(vector>& node_coverages, size_t min_coverage) { + for (int i = 1; i < node_coverages.size(); ++i) { + for (const auto& node_cov : node_coverages[i]) { + node_coverages[0][node_cov.first] += node_cov.second; + } + } + size_t tot_coverage = 0; + size_t tot_count = 0; + for (const auto & node_cov : node_coverages[0]) { + if (node_cov.second >= min_coverage) { + tot_coverage += node_cov.second; + ++tot_count; + } + } + + return tot_count > 0 ? (double)tot_coverage / (double)tot_count : 0; +} + + +double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage) { + // one node counter per thread + vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); + + function aln_callback = [&](Alignment& aln) { + update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + }; + vg::io::for_each_parallel(gam_stream, aln_callback); + return combine_and_average_node_coverages(node_coverages, min_coverage); +} + +double sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage) { + // one node counter per thread + vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); + +#pragma omp parallel for + for (size_t i = 0; i < alignments.size(); ++i) { + update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + } + return combine_and_average_node_coverages(node_coverages, min_coverage); +} + + + +} + + + + +} + diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp new file mode 100644 index 00000000000..b7ffb645a4a --- /dev/null +++ b/src/algorithms/coverage_depth.hpp @@ -0,0 +1,48 @@ +#ifndef VG_DEPTH_HPP_INCLUDED +#define VG_DEPTH_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "packer.hpp" + + +namespace vg { +namespace algorithms { + +using namespace std; + +/// Estimate the depth of coverage of a given (sub) graph using the packer +/// Coverage is computed relative to the given path +double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path); + +/// Estimate the binned coverage along a path using the packer +/// ref_path is scanned, and every "step" bases as subgraph is extracted using the given number of context steps +/// If threads is 0, all the threads are used +map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, + size_t step, size_t context, size_t threads = 0); + + +/// Get the depth of a bin +/// the "k_nearest" closest bins to the given position are used +/// bins with coverage below min_coverage are ignored +double get_binned_depth(const unordered_map& binned_depths, size_t pos, size_t k_nearest = 3, double min_coverage = 1.0); + +/// Return the average depth of coverage of randomly 
sampled nodes from a GAM +/// Nodes with less than min_coverage are ignored +/// The stream is scanned in parallel with all threads +/// max_nodes is used to keep memory down +double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0); + +/// As above, but read a vector instead of a stream +double sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0); + +} +} + +#endif diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp new file mode 100644 index 00000000000..7a892397a3e --- /dev/null +++ b/src/subcommand/depth_main.cpp @@ -0,0 +1,200 @@ +/** \file depth_main.cpp + * + * Estimate sequencing depth from a (packed) alignment. + */ + + +#include +#include +#include + +#include +#include + +#include "subcommand.hpp" + +#include +#include +#include "../handle.hpp" +#include +#include "../utility.hpp" +#include "../packer.hpp" +#include "algorithms/coverage_depth.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_depth(char** argv) { + cerr << "usage: " << argv[0] << " depth [options] " << endl + << "options:" << endl + << " packed coverage depth:" << endl + << " -k, --pack FILE Supports created from vg pack for given input graph" << endl + << " -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths)" << endl + << " -c, --context-size N Context size (steps) for expanding bin subgraphs [50]" << endl + << " -b, --bin-size N Bin size (in bases) [10000000]" << endl + << " GAM coverage depth:" << endl + << " -g, --gam FILE read alignments from this file (could be '-' for stdin)" << endl + << " -n, --max-nodes N maximum nodes to consider [1000000]" << endl + << " -s, --random-seed N random seed for sampling nodes to consider" << endl + << " common options:" << endl + << " -m, --min-coverage N ignore nodes with less than N coverage [1]" << endl + << " -t, --threads N Number of threads to use [all available]" << endl; +} + +int main_depth(int argc, char** argv) { + + if (argc == 2) { + help_depth(argv); + return 1; + } + + string pack_filename; + vector ref_paths; + size_t context_steps = 50; + size_t bin_size = 10000000; + + string gam_filename; + size_t max_nodes = 1000000; + int random_seed = time(NULL); + + size_t min_coverage = 1; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + + static const struct option long_options[] = { + {"pack", required_argument, 0, 'k'}, + {"ref-path", required_argument, 0, 'p'}, + {"context-size", required_argument, 0, 'c'}, + {"bin-size", required_argument, 0, 'b'}, + {"gam", required_argument, 0, 'g'}, + {"max-nodes", required_argument, 0, 'n'}, + {"random-seed", required_argument, 0, 's'}, + {"min-coverage", required_argument, 0, 'm'}, + {"threads", required_argument, 0, 't'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hk:p:c:b:g:n:s:m:t:", + long_options, &option_index); + + // Detect the end of the options. 
+ if (c == -1) + break; + + switch (c) + { + case 'k': + pack_filename = optarg; + break; + case 'p': + ref_paths.push_back(optarg); + break; + case 'c': + context_steps = parse(optarg); + break; + case 'b': + bin_size = parse(optarg); + break; + case 'g': + gam_filename = optarg; + break; + case 'n': + max_nodes = parse(optarg); + break; + case 's': + random_seed = parse(optarg); + break; + case 'm': + min_coverage = parse(optarg); + break; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg depth] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help_depth(argv); + exit(1); + break; + default: + abort (); + } + } + + if (argc <= 2) { + help_depth(argv); + return 1; + } + + if (pack_filename.empty() == gam_filename.empty() ) { + cerr << "error:[vg depth] Either a pack file (-k) or a gam file (-g) must be given" << endl; + exit(1); + } + + // Read the graph + unique_ptr path_handle_graph; + get_input_file(optind, argc, argv, [&](istream& in) { + path_handle_graph = vg::io::VPKG::load_one(in); + }); + PathHandleGraph* graph = path_handle_graph.get(); + + // Apply the overlay if necessary + bdsg::PathVectorizableOverlayHelper overlay_helper; + if (!pack_filename.empty()) { + graph = dynamic_cast(overlay_helper.apply(path_handle_graph.get())); + assert(graph != nullptr); + } + + // Process the pack + unique_ptr packer; + if (!pack_filename.empty()) { + // Load our packed supports (they must have come from vg pack on graph) + packer = unique_ptr(new Packer(graph)); + packer->load_from_file(pack_filename); + + // All paths if none given + if (ref_paths.empty()) { + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (!Paths::is_alt(path_name)) { + ref_paths.push_back(path_name); + } + }); + } + + for (const string& ref_path : ref_paths) { + map binned_depth = algorithms::binned_packed_depth(*graph, *packer, ref_path, bin_size, get_thread_count()); + for (auto& bin_cov : binned_depth) { + cerr << ref_path << "\t" << bin_cov.first << "\t" << bin_cov.second << endl; + } + } + } + + // Process the gam + if (!gam_filename.empty()) { + double gam_cov; + get_input_file(gam_filename, [&] (istream& gam_stream) { + gam_cov = algorithms::sample_gam_depth(*graph, gam_stream, max_nodes, random_seed, min_coverage); + }); + cerr << "gam-coverage\t" << gam_cov << endl; + } + + return 0; + +} + +// Register subcommand +static Subcommand vg_depth("depth", "estimate sequencing depth", main_depth); + From a50d14f90c0461b58ff0a7abc641ed16dc0d2f12 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 1 Nov 2019 11:52:46 -0400 Subject: [PATCH 13/79] more sensible path coverage. 
include variance --- src/algorithms/coverage_depth.cpp | 249 ++++++++++++++++-------------- src/algorithms/coverage_depth.hpp | 40 ++--- src/subcommand/depth_main.cpp | 65 +++++--- src/utility.cpp | 18 +++ src/utility.hpp | 3 + 5 files changed, 213 insertions(+), 162 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index e848161124f..351ed63e1f0 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -7,134 +7,136 @@ namespace vg { namespace algorithms { -/// Estimate the depth of coverage of a given (sub) graph using the packer -/// Coverage is computed relative to the given path -double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path) { - - // get the path length - path_handle_t path_handle = graph.get_path_handle(ref_path); - size_t path_len = 0; - for (handle_t handle : graph.scan_path(path_handle)) { - path_len += graph.get_length(handle); - } - if (path_len == 0) { - return 0; +void packed_depths(const Packer& packer, const string& path_name, size_t min_coverage, ostream& out_stream) { + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + path_handle_t path_handle = graph.get_path_handle(path_name); + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + Position cur_pos; + size_t path_offset = 1; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + cur_pos.set_node_id(cur_id); + cur_pos.set_is_reverse(graph.get_is_reverse(cur_handle)); + for (size_t i = 0; i < cur_len; ++i) { + cur_pos.set_offset(i); + size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)); + out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n"; + ++path_offset; + } } - - // sum up the coverage - size_t tot_base_coverage = 0; - graph.for_each_handle([&] (handle_t handle) { - Position pos; - pos.set_node_id(graph.get_id(handle)); - size_t packer_pos = packer.position_in_basis(pos); - size_t node_len = graph.get_length(handle); - for (size_t offset = 0; offset < node_len; ++offset) { - tot_base_coverage += packer.coverage_at_position(packer_pos + offset); - } - }); - - // return average (over the path) - return (double)tot_base_coverage / (double)path_len; } - -/// Estimate the binned coverage along a path -map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, - size_t step, size_t context, size_t threads) { - - // move forward along path (note: this can be sped up if we're given a PathPositionHandleGraph but I don't think - // it matters for a couple of scans. 
- function advance = [&] (step_handle_t& step_handle, size_t distance) { - size_t went = 0; - for (; graph.has_next_step(step_handle) && went < distance; step_handle = graph.get_next_step(step_handle)) { - went += graph.get_length(graph.get_handle_of_step(step_handle)); +pair packed_depth_of_bin(const Packer& packer, + step_handle_t start_step, step_handle_t end_plus_one_step, + size_t min_coverage, bool include_deletions) { + + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + + // coverage of each node via deletion (that's contained in the bin) + unordered_map deletion_coverages; + if (include_deletions) { + const VectorizableHandleGraph* vec_graph = dynamic_cast(packer.get_graph()); + unordered_map deletion_candidates; + handle_t prev_handle; + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + graph.follow_edges(cur_handle, true, [&] (handle_t other) { + if (!deletion_candidates.empty() && other!= prev_handle && deletion_candidates.count(other)) { + edge_t edge = graph.edge_handle(other, cur_handle); + size_t edge_pos = vec_graph->edge_index(edge); + size_t deletion_coverage = packer.edge_coverage(edge_pos); + // quadratic alert. if this is too slow, can use interval tree or something + for (step_handle_t del_step = graph.get_next_step(deletion_candidates[other]); + del_step != cur_step; + del_step = graph.get_next_step(del_step)) { + handle_t del_handle = graph.get_handle_of_step(del_step); + nid_t del_id = graph.get_id(del_handle); + if (!deletion_coverages.count(del_id)) { + deletion_coverages[del_id] = deletion_coverage; + } else { + deletion_coverages[del_id] += deletion_coverage; + } + } + } + }); + prev_handle = cur_handle; + deletion_candidates[cur_handle] = cur_step; } - return went; - }; + } - path_handle_t path_handle = graph.get_path_handle(ref_path); - step_handle_t step_handle = graph.path_begin(path_handle); - - // hop along the graph, grabbing a step handle every "step" bases (or thereabouts) - vector> bin_centers; - size_t pos = advance(step_handle, step / 2); - if (pos >= step / 2) { - size_t went; - do { - if (bin_centers.empty() || step_handle != bin_centers.back().second) { - bin_centers.push_back(make_pair(pos, step_handle)); + // compute the mean and variance of our base coverage across the bin + size_t bin_length = 0; + double mean = 0.0; + double M2 = 0.0; + + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + size_t del_coverage = !include_deletions or !deletion_coverages.count(cur_id) ? 
0 : deletion_coverages[cur_id]; + Position cur_pos; + cur_pos.set_node_id(cur_id); + cur_pos.set_is_reverse(graph.get_is_reverse(cur_handle)); + for (size_t i = 0; i < cur_len; ++i) { + cur_pos.set_offset(i); + size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)) + del_coverage; + if (pos_coverage >= min_coverage) { + wellford_update(bin_length, mean, M2, pos_coverage); } - went = advance(step_handle, step); - pos += went; - } while (went >= step); + } } + return wellford_mean_var(bin_length, mean, M2, true); +} - // our graph's too small to do any stepping, just use the first handle - if (bin_centers.empty()) { - bin_centers.push_back(make_pair(0, graph.path_begin(path_handle))); +vector> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size, + size_t min_coverage, bool include_deletions) { + + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + path_handle_t path_handle = graph.get_path_handle(path_name); + + // one scan of our path to collect the bins + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + vector> bins; // start offset / start step of each bin + size_t offset = 0; + size_t cur_bin_size = bin_size; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + if (cur_bin_size >= bin_size) { + bins.push_back(make_pair(offset, cur_step)); + cur_bin_size = 0; + } + size_t node_len = graph.get_length(graph.get_handle_of_step(cur_step)); + offset += node_len; + cur_bin_size += node_len; } - // visit every bin center and make a subgraph to collect coverage from - if (threads == 0) { - threads = get_thread_count(); - } - map binned_depths; -#pragma omp parallel for num_threads(threads) - for (size_t i = 0; i < bin_centers.size(); ++i) { - // extract the subgraph - bdsg::HashGraph subgraph; - step_handle_t bin_step = bin_centers[i].second; - handle_t bin_handle = graph.get_handle_of_step(bin_step); - assert(graph.get_is_reverse(bin_handle) == false); - subgraph.create_handle(graph.get_sequence(bin_handle), graph.get_id(bin_handle)); - expand_subgraph_by_steps(graph, subgraph, context); - - // sum up the coverage on the subgraph - size_t tot_base_coverage = 0; - size_t tot_ref_len = 0; - subgraph.for_each_handle([&] (handle_t sub_handle) { - // go back into the original graph because we don't have any - // path information in the subgraph because we are unable - // to get it without requiring the path position interface - handle_t orig_handle = graph.get_handle(subgraph.get_id(sub_handle)); - Position pos; - pos.set_node_id(graph.get_id(orig_handle)); - size_t packer_pos = packer.position_in_basis(pos); - size_t node_len = graph.get_length(orig_handle); - for (size_t offset = 0; offset < node_len; ++offset) { - tot_base_coverage += packer.coverage_at_position(packer_pos + offset); - } - // we manually test if each handle is on our reference path (again, to - // not require path position interface) - vector step_path_handles = graph.steps_of_handle(orig_handle); - bool on_ref = false; - for (size_t j = 0; j < step_path_handles.size() && !on_ref; ++j) { - on_ref = graph.get_path_handle_of_step(step_path_handles[j]) == path_handle; - } - if (on_ref) { - tot_ref_len += node_len; - } - }); - - assert(tot_ref_len > 0); - double avg_base_coverage = tot_base_coverage / tot_ref_len; - -#pragma omp critical (update_binned_depth) - binned_depths[bin_centers[i].first] = avg_base_coverage; + // parallel 
scan to compute the coverages + vector> binned_depths(bins.size()); +#pragma omp parallel for + for (size_t i = 0; i < bins.size(); ++i) { + step_handle_t bin_start_step = bins[i].second; + step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; + size_t bin_start = bins[i].first; + size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; + pair coverage = packed_depth_of_bin(packer, bin_start_step, bin_end_step, min_coverage, include_deletions); + binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); } return binned_depths; - } + // draw (roughly) max_nodes nodes from the graph using the random seed static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { default_random_engine generator(random_seed); uniform_real_distribution distribution(0, 1); - double cutoff = std::min((double)1.0, (double)(max_nodes / graph.get_node_count())); + double cutoff = std::min((double)1.0, (double)max_nodes / (double)graph.get_node_count()); unordered_map sampled_nodes; graph.for_each_handle([&](handle_t handle) { - if (cutoff == 1 || cutoff < distribution(generator)) { + if (cutoff == 1. || cutoff <= distribution(generator)) { sampled_nodes[graph.get_id(handle)] = 0; } }); @@ -148,52 +150,59 @@ static void update_sample_gam_depth(const Alignment& aln, unordered_map>& node_coverages, size_t min_coverage) { +static pair combine_and_average_node_coverages(const HandleGraph& graph, vector>& node_coverages, size_t min_coverage) { for (int i = 1; i < node_coverages.size(); ++i) { for (const auto& node_cov : node_coverages[i]) { node_coverages[0][node_cov.first] += node_cov.second; } } - size_t tot_coverage = 0; - size_t tot_count = 0; + size_t count = 0; + double mean = 0.; + double M2 = 0.; for (const auto & node_cov : node_coverages[0]) { if (node_cov.second >= min_coverage) { - tot_coverage += node_cov.second; - ++tot_count; + // we normalize the bases covered by the node length as we sum + double node_len = graph.get_length(graph.get_handle(node_cov.first)); + wellford_update(count, mean, M2, (double)node_cov.second / node_len); } } - return tot_count > 0 ? 
(double)tot_coverage / (double)tot_count : 0; + return wellford_mean_var(count, mean, M2, count < graph.get_node_count()); } -double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage) { +pair sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq) { // one node counter per thread vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); function aln_callback = [&](Alignment& aln) { - update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + if (aln.mapping_quality() >= min_mapq) { + update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + } }; vg::io::for_each_parallel(gam_stream, aln_callback); - return combine_and_average_node_coverages(node_coverages, min_coverage); + return combine_and_average_node_coverages(graph, node_coverages, min_coverage); } -double sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage) { +pair sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq) { // one node counter per thread vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); #pragma omp parallel for for (size_t i = 0; i < alignments.size(); ++i) { - update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + if (alignments[i].mapping_quality() >= min_mapq) { + update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + } } - return combine_and_average_node_coverages(node_coverages, min_coverage); + return combine_and_average_node_coverages(graph, node_coverages, min_coverage); } diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index b7ffb645a4a..18d70daa26a 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -17,30 +17,32 @@ namespace algorithms { using namespace std; -/// Estimate the depth of coverage of a given (sub) graph using the packer -/// Coverage is computed relative to the given path -double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path); - -/// Estimate the binned coverage along a path using the packer -/// ref_path is scanned, and every "step" bases as subgraph is extracted using the given number of context steps -/// If threads is 0, all the threads are used -map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, - size_t step, size_t context, size_t threads = 0); - - -/// Get the depth of a bin -/// the "k_nearest" closest bins to the given position are used -/// bins with coverage below min_coverage are ignored -double get_binned_depth(const unordered_map& binned_depths, size_t pos, size_t k_nearest = 3, double min_coverage = 1.0); - -/// Return the average depth of coverage of randomly sampled nodes from a GAM +/// print path-name offset base-coverage for every base on a path (just like samtools depth) +/// ignoring things below min_coverage. 
offsets are 1-based in output stream
+void packed_depths(const Packer& packer, const string& path_name, size_t min_coverage, ostream& out_stream);
+
+/// Estimate the coverage along a given reference path interval [start_step, end_plus_one_step)
+/// Coverage is obtained only from positions along the path, and variation is not counted
+/// Except if "include_deletions" is true, then reference path positions covered by a deletion edge
+/// (which is contained in the bin) will get the deletion edge's coverage counted.
+/// Other types of events (such as SNPs) can throw off coverage in similar ways but deletions tend to be bigger
+/// (and easier to find), so we hope that counting them is enough.
+pair<double, double> packed_depth_of_bin(const Packer& packer, step_handle_t start_step, step_handle_t end_plus_one_step,
+                                         size_t min_coverage, bool include_deletions);
+
+/// Use all available threads to estimate the binned packed coverage of a path using the above function
+/// Each element is a bin's 0-based open-ended interval in the path, and its coverage mean, variance.
+vector<tuple<size_t, size_t, double, double>> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size,
+                                                                  size_t min_coverage, bool include_deletions);
+
+/// Return the mean and variance of coverage of randomly sampled nodes from a GAM
 /// Nodes with less than min_coverage are ignored
 /// The stream is scanned in parallel with all threads
 /// max_nodes is used to keep memory down
-double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0);
+pair<double, double> sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq);

 /// As above, but read a vector instead of a stream
-double sample_gam_depth(const HandleGraph& graph, const vector<Alignment>& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0);
+pair<double, double> sample_gam_depth(const HandleGraph& graph, const vector<Alignment>& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq);

 }
 }
diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp
index 7a892397a3e..0e5fd75b1e4 100644
--- a/src/subcommand/depth_main.cpp
+++ b/src/subcommand/depth_main.cpp
@@ -28,15 +28,16 @@ using namespace vg::subcommand;

 void help_depth(char** argv) {
     cerr << "usage: " << argv[0] << " depth [options] <graph>" << endl
         << "options:" << endl
-        << "  packed coverage depth:" << endl
-        << "    -k, --pack FILE       Supports created from vg pack for given input graph" << endl
-        << "    -p, --ref-path NAME   Reference path to call on (multipile allowed. defaults to all paths)" << endl
-        << "    -c, --context-size N  Context size (steps) for expanding bin subgraphs [50]" << endl
-        << "    -b, --bin-size N      Bin size (in bases) [10000000]" << endl
-        << "  GAM coverage depth:" << endl
-        << "    -g, --gam FILE        read alignments from this file (could be '-' for stdin)" << endl
-        << "    -n, --max-nodes N     maximum nodes to consider [1000000]" << endl
-        << "    -s, --random-seed N   random seed for sampling nodes to consider" << endl
+        << "  packed coverage depth (print positional depths along path):" << endl
+        << "    -k, --pack FILE       Supports created from vg pack for given input graph" << endl
+        << "    -p, --ref-path NAME   Reference path to call on (multiple allowed. defaults to all paths)" << endl
+        << "    -b, --bin-size N      Bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
+        << "    -d, --count-dels      Count deletion edges within the bin as covering reference positions" << endl
+        << "  GAM coverage depth (print <mean> <stddev> for depth):" << endl
+        << "    -g, --gam FILE        read alignments from this file (could be '-' for stdin)" << endl
+        << "    -n, --max-nodes N     maximum nodes to consider [1000000]" << endl
+        << "    -s, --random-seed N   random seed for sampling nodes to consider" << endl
+        << "    -Q, --min-mapq N      ignore alignments with mapping quality < N" << endl
         << "  common options:" << endl
         << "    -m, --min-coverage N  ignore nodes with less than N coverage [1]" << endl
         << "    -t, --threads N       Number of threads to use [all available]" << endl;
@@ -51,12 +52,13 @@ int main_depth(int argc, char** argv) {

     string pack_filename;
     vector<string> ref_paths;
-    size_t context_steps = 50;
-    size_t bin_size = 10000000;
+    size_t bin_size = 1;
+    bool count_dels = false;

     string gam_filename;
     size_t max_nodes = 1000000;
     int random_seed = time(NULL);
+    size_t min_mapq = 0;

     size_t min_coverage = 1;

@@ -67,11 +69,12 @@ int main_depth(int argc, char** argv) {
         static const struct option long_options[] = {
             {"pack", required_argument, 0, 'k'},
             {"ref-path", required_argument, 0, 'p'},
-            {"context-size", required_argument, 0, 'c'},
             {"bin-size", required_argument, 0, 'b'},
+            {"count-dels", no_argument, 0, 'd'},
             {"gam", required_argument, 0, 'g'},
             {"max-nodes", required_argument, 0, 'n'},
             {"random-seed", required_argument, 0, 's'},
+            {"min-mapq", required_argument, 0, 'Q'},
             {"min-coverage", required_argument, 0, 'm'},
             {"threads", required_argument, 0, 't'},
             {"help", no_argument, 0, 'h'},
             {0, 0, 0, 0}
         };

         int option_index = 0;
-        c = getopt_long (argc, argv, "hk:p:c:b:g:n:s:m:t:",
+        c = getopt_long (argc, argv, "hk:p:b:dg:n:s:Q:m:t:",
                          long_options, &option_index);

        // Detect the end of the options.
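The mean/variance pairs threaded through the changes below are accumulated with Welford's online algorithm, via the wellford_update/wellford_mean_var helpers this patch adds to src/utility.cpp. A minimal self-contained sketch of the same accumulator, with made-up sample values, shows what the two helpers compute together; Welford's recurrence avoids the catastrophic cancellation a naive sum-of-squares accumulator suffers when values are large and tightly clustered:

    #include <cstdio>
    #include <initializer_list>

    // One-pass update rule, as in wellford_update: no samples are stored.
    static void update(size_t& n, double& mean, double& M2, double x) {
        ++n;
        double delta = x - mean;   // deviation from the old mean
        mean += delta / (double)n;
        M2 += delta * (x - mean);  // times deviation from the new mean
    }

    int main() {
        size_t n = 0;
        double mean = 0.0, M2 = 0.0;
        for (double x : {2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0}) {
            update(n, mean, M2, x);
        }
        // population variance, i.e. wellford_mean_var(n, mean, M2, false)
        std::printf("mean=%g variance=%g\n", mean, M2 / (double)n); // mean=5 variance=4
        return 0;
    }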
@@ -94,12 +97,12 @@ int main_depth(int argc, char** argv) {
         case 'p':
             ref_paths.push_back(optarg);
             break;
-        case 'c':
-            context_steps = parse<size_t>(optarg);
-            break;
         case 'b':
             bin_size = parse<size_t>(optarg);
             break;
+        case 'd':
+            count_dels = true;
+            break;
         case 'g':
             gam_filename = optarg;
             break;
@@ -109,6 +112,9 @@ int main_depth(int argc, char** argv) {
         case 's':
             random_seed = parse<int>(optarg);
             break;
+        case 'Q':
+            min_mapq = parse<size_t>(optarg);
+            break;
         case 'm':
             min_coverage = parse<size_t>(optarg);
             break;
@@ -172,25 +178,39 @@ int main_depth(int argc, char** argv) {
                     ref_paths.push_back(path_name);
                 }
             });
+        } else {
+            for (const string& ref_name : ref_paths) {
+                if (!graph->has_path(ref_name)) {
+                    cerr << "error:[vg depth] Path \"" << ref_name << "\" not found in graph" << endl;
+                    exit(1);
+                }
+            }
         }
+
         for (const string& ref_path : ref_paths) {
-            map<size_t, double> binned_depth = algorithms::binned_packed_depth(*graph, *packer, ref_path, bin_size, get_thread_count());
-            for (auto& bin_cov : binned_depth) {
-                cerr << ref_path << "\t" << bin_cov.first << "\t" << bin_cov.second << endl;
+            if (bin_size > 1) {
+                vector<tuple<size_t, size_t, double, double>> binned_depth =
+                    algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels);
+                for (auto& bin_cov : binned_depth) {
+                    cout << ref_path << "\t" << (get<0>(bin_cov) + 1) << "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
+                         << "\t" << sqrt(get<3>(bin_cov)) << endl;
+                }
+            } else {
+                algorithms::packed_depths(*packer, ref_path, min_coverage, cout);
             }
         }
     }

     // Process the gam
     if (!gam_filename.empty()) {
-        double gam_cov;
+        pair<double, double> gam_cov;
         get_input_file(gam_filename, [&] (istream& gam_stream) {
-            gam_cov = algorithms::sample_gam_depth(*graph, gam_stream, max_nodes, random_seed, min_coverage);
+            gam_cov = algorithms::sample_gam_depth(*graph, gam_stream, max_nodes, random_seed, min_coverage, min_mapq);
         });
-        cerr << "gam-coverage\t" << gam_cov << endl;
+        cout << gam_cov.first << "\t" << sqrt(gam_cov.second) << endl;
     }
-    
+
     return 0;
 }
diff --git a/src/utility.cpp b/src/utility.cpp
index a9f72f12cf5..ee6c71ef205 100644
--- a/src/utility.cpp
+++ b/src/utility.cpp
@@ -336,6 +336,24 @@ double median(std::vector<double> &v) {
         return 0.5*(vn+v[n-1]);
     }
 }
+
+// from the Python example here:
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+void wellford_update(size_t& count, double& mean, double& M2, double new_val) {
+    ++count;
+    double delta = new_val - mean;
+    mean += delta / (double)count;
+    double delta2 = new_val - mean;
+    M2 += delta * delta2;
+}
+
+pair<double, double> wellford_mean_var(size_t count, double mean, double M2, bool sample_variance) {
+    if (count == 0 || (sample_variance && count == 1)) {
+        return make_pair(nan(""), nan(""));
+    } else {
+        return make_pair(mean, M2 / (double)(sample_variance ?
count - 1 : count)); + } +} vector range_vector(size_t begin, size_t end) { size_t len = end - begin; diff --git a/src/utility.hpp b/src/utility.hpp index 72c04c3144a..d3823b6c05b 100644 --- a/src/utility.hpp +++ b/src/utility.hpp @@ -51,6 +51,9 @@ string nonATGCNtoN(const string& s); string toUppercase(const string& s); double median(std::vector &v); double stdev(const std::vector& v); +// Online mean-variance computation with wellfords algorithm (pass 0's to 1st 3 params to start) +void wellford_update(size_t& count, double& mean, double& M2, double new_val); +pair wellford_mean_var(size_t count, double mean, double M2, bool sample_variance = false); // write a fasta sqeuence void write_fasta_sequence(const std::string& name, const std::string& sequence, ostream& os, size_t width=80); From 4b664f53c35e5f3baa10aa1e7c28007c2b86ccf2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 1 Nov 2019 16:57:21 -0700 Subject: [PATCH 14/79] Turn off debug output but leave improved provenance --- src/minimizer_mapper.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index 7cfe7bd0ce6..1b6079be4a5 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -15,7 +15,6 @@ #include #include - namespace vg { using namespace std; @@ -31,6 +30,10 @@ MinimizerMapper::MinimizerMapper(const gbwtgraph::GBWTGraph& graph, const gbwtgr void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // For each input alignment + +#ifdef debug + cerr << "Read " << aln.name() << ": " << aln.sequence() << endl; +#endif // Make a new funnel instrumenter to watch us map this read. Funnel funnel; @@ -109,7 +112,17 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // of the selected minimizers is not high enough. size_t hits = minimizer_index.count(minimizers[minimizer_num]); - if (hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { +#ifdef debug + cerr << "Minimizer " << minimizer_num << " = " << minimizers[minimizer_num].key.decode(minimizer_index.k()) + << " has " << hits << " hits" << endl; +#endif + + if (hits == 0) { + // A minimizer with no hits can't go on. + if (track_provenance) { + funnel.fail("any-hits", minimizer_num); + } + } else if (hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { // Locate the hits. for (auto& hit : minimizer_index.find(minimizers[minimizer_num])) { // Reverse the hits for a reverse minimizer @@ -125,6 +138,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { if (track_provenance) { // Record in the funnel that this minimizer gave rise to these seeds. 
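+                    // (the minimizer had at least one hit, so it passes the new "any-hits" stage)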
+ funnel.pass("any-hits", minimizer_num); funnel.pass("hard-hit-cap", minimizer_num); funnel.pass("hit-cap||score-fraction", minimizer_num, selected_score / base_target_score); funnel.expand(minimizer_num, hits); @@ -133,6 +147,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // Passed hard hit cap but failed score fraction/normal hit cap rejected_count++; if (track_provenance) { + funnel.pass("any-hits", minimizer_num); funnel.pass("hard-hit-cap", minimizer_num); funnel.fail("hit-cap||score-fraction", minimizer_num, (selected_score + minimizer_score[minimizer_num]) / base_target_score); } @@ -140,6 +155,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // Failed hard hit cap rejected_count++; if (track_provenance) { + funnel.pass("any-hits", minimizer_num); funnel.fail("hard-hit-cap", minimizer_num); } } @@ -177,7 +193,6 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { } #ifdef debug - cerr << "Read " << aln.name() << ": " << aln.sequence() << endl; cerr << "Found " << seeds.size() << " seeds from " << (minimizers.size() - rejected_count) << " minimizers, rejected " << rejected_count << endl; #endif @@ -304,7 +319,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { vector& cluster = clusters[cluster_num]; #ifdef debug - cerr << "Cluster " << cluster_num << " rank " << i << ": " << endl; + cerr << "Cluster " << cluster_num << endl; #endif // Pack the seeds for GaplessExtender. From 5e984558fac66ef32553ffd7ad098428d956290b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 11:20:29 -0800 Subject: [PATCH 15/79] Make Giraffe Wrangler do vg stats to compare map and giraffe --- scripts/giraffe-wrangler.sh | 188 +++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 59 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index 9bce06a9bad..aeb2ce6d275 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -9,20 +9,39 @@ usage() { exec 1>&2 printf "Usage: $0 [Options] FASTA XG_INDEX GCSA_INDEX GBWT_INDEX MINIMIZER_INDEX DISTANCE_INDEX SIM_GAM REAL_FASTQ\n" printf "\n" + printf "Inputs may be files or S3 URLs.\n" + printf "\n" + printf "Arguments:\n" + printf " FASTA FASTA reference to run bwa-mem against; may be \"\"\n" + printf " XG_INDEX XG to annotate reads with positions\n" + printf " GCSA_INDEX GCSA (with LCP) for running vg map\n" + printf " GBWT_INDEX Haplotypes for mapping with Giraffe\n" + printf " MINIMIZER_INDEX Minimizers for mapping with Giraffe\n" + printf " DISTANCE_INDEX Distances for mapping with Giraffe\n" + printf " SIM_GAM Simulated reads for measuring mapping accuracy; may be \"\"\n" + printf " REAL_FASTQ Real reads for measuring mapping performance; may be \"\"\n" + printf "\n" printf "Options:\n" - printf " -t N Use N threads\n" + printf " -s DEST Save alignments and other internal files to DEST (directory or S3 url)\n" + printf " -t N Use N threads\n" printf "\n" exit 1 } +# Define where we should save our output +OUTPUT_DEST="" + # Define the thread count for everyone. Can be changed with -t. # Should fit on a NUMA node THREAD_COUNT=24 -while getopts ":t:" o; do +while getopts ":s:t:" o; do case "${o}" in + s) + OUTPUT_DEST="${OPTARG}" + ;; t) - THREAD_COUNT=$OPTARG + THREAD_COUNT="${OPTARG}" ;; ?) usage @@ -38,10 +57,10 @@ fi echo "Using ${THREAD_COUNT} threads" fetch_input() { - # Download the specified file, if not a file already. 
+ # Download the specified file, if not empty and not a file already. # Dumps all files into the current directory as their basenames # Output the new filename - if [[ "${1}" == s3://* ]] ; then + if [[ ! -z "${1}" && "${1}" == s3://* ]] ; then aws s3 --quiet cp "${1}" "$(basename "${1}")" basename "${1}" else @@ -50,10 +69,13 @@ fetch_input() { } FASTA="$(fetch_input "${1}")" -for EXT in amb ann bwt fai pac sa ; do - # Make sure we have all the indexes adjacent to the FASTA - fetch_input "${1}.${EXT}" >/dev/null -done +if [[ ! -z ${FASTA} ]] ; then + # We have a FASTA + for EXT in amb ann bwt fai pac sa ; do + # Make sure we have all the indexes adjacent to the FASTA + fetch_input "${1}.${EXT}" >/dev/null + done +fi shift XG_INDEX="$(fetch_input "${1}")" # Make sure we have the GBWTGraph pre-made @@ -112,60 +134,84 @@ if [[ "${NUMA_COUNT}" -gt "1" ]] ; then fi fi -if which perf >/dev/null 2>&1 ; then - # Record profile. - # Do this first because perf is likely to be misconfigured and we want to fail fast. - - # If we don't strip bin/vg to make it small, the addr2line calls that perf - # script makes take forever because the binary is huge - strip -d bin/vg - - ${NUMA_PREFIX} perf record -F 100 --call-graph dwarf -o "${WORK}/perf.data" vg gaffe "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/perf.gam" - perf script -i "${WORK}/perf.data" >"${WORK}/out.perf" - deps/FlameGraph/stackcollapse-perf.pl "${WORK}/out.perf" >"${WORK}/out.folded" - deps/FlameGraph/flamegraph.pl "${WORK}/out.folded" > "${WORK}/profile.svg" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + if which perf >/dev/null 2>&1 ; then + # Record profile. + # Do this first because perf is likely to be misconfigured and we want to fail fast. + + # If we don't strip bin/vg to make it small, the addr2line calls that perf + # script makes take forever because the binary is huge + strip -d bin/vg + + ${NUMA_PREFIX} perf record -F 100 --call-graph dwarf -o "${WORK}/perf.data" vg gaffe "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/perf.gam" + perf script -i "${WORK}/perf.data" >"${WORK}/out.perf" + deps/FlameGraph/stackcollapse-perf.pl "${WORK}/out.perf" >"${WORK}/out.folded" + deps/FlameGraph/flamegraph.pl "${WORK}/out.folded" > "${WORK}/profile.svg" + fi fi -# Run simulated reads, with stats -${NUMA_PREFIX} vg gaffe --track-correctness -x "${XG_INDEX}" "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/mapped.gam" +if [[ ! -z "${SIM_GAM}" ]] ; then + # Do simulated reads + + # Run simulated reads, with stats + ${NUMA_PREFIX} vg gaffe --track-correctness -x "${XG_INDEX}" "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/mapped.gam" + + # And map to compare with them + ${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" >"${WORK}/mapped-map.gam" + + # Annotate and compare against truth + vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped.gam" >"${WORK}/annotated.gam" + vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped-map.gam" >"${WORK}/annotated-map.gam" + + # GAM compare against truth. Use gamcompare to count correct reads to save a JSON scan. 
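# (Editorial sketch: the "2>&1 >/dev/null" used in the two commands below is
# order-sensitive shell -- stderr is duplicated onto the current stdout (the
# pipe) before stdout itself is discarded, so sed receives only gamcompare's
# stderr summary. The same idiom with a hypothetical command:
#
#   count="$(some_tool 2>&1 >/dev/null | sed 's/[^0-9]//g')"
#
# Writing ">/dev/null 2>&1" instead would silence both streams and leave the
# variable empty.)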
+ CORRECT_COUNT="$(vg gamcompare -r 100 "${WORK}/annotated.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + CORRECT_COUNT_MAP="$(vg gamcompare -r 100 "${WORK}/annotated-map.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + + # Compute identity of mapped reads + MEAN_IDENTITY="$(vg view -aj "${WORK}/mapped.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" + MEAN_IDENTITY_MAP="$(vg view -aj "${WORK}/mapped-map.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" -# And map to compare with them -${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" >"${WORK}/mapped-map.gam" + # Compute loss stages + # Let giraffe facts errors out + vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt" +fi -# Annotate and compare against truth -vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped.gam" >"${WORK}/annotated.gam" -vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped-map.gam" >"${WORK}/annotated-map.gam" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Now do the real reads -# GAM compare against truth. Use gamcompare to count correct reads to save a JSON scan. -CORRECT_COUNT="$(vg gamcompare -r 100 "${WORK}/annotated.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" -CORRECT_COUNT_MAP="$(vg gamcompare -r 100 "${WORK}/annotated-map.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + # Count them + REAL_READ_COUNT="$(cat "${REAL_FASTQ}" | wc -l)" + ((REAL_READ_COUNT /= 4)) -# Compute identity of mapped reads -MEAN_IDENTITY="$(vg view -aj "${WORK}/mapped.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" -MEAN_IDENTITY_MAP="$(vg view -aj "${WORK}/mapped-map.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" + # Get RPS for Giraffe + ${NUMA_PREFIX} vg gaffe -p "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/real.gam" 2>"${WORK}/log.txt" -# Compute loss stages -# Let giraffe facts errors out -vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt" + GIRAFFE_RPS="$(cat "${WORK}/log.txt" | grep "reads per second" | sed 's/[^0-9.]//g')" -# Now do the real reads + if [[ ! -z "${FASTA}" ]] ; then + # Get RPS for bwa-mem -# Get RPS -${NUMA_PREFIX} vg gaffe -p "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/real.gam" 2>"${WORK}/log.txt" + ${NUMA_PREFIX} bwa mem -t "${THREAD_COUNT}" "${FASTA}" "${REAL_FASTQ}" >"${WORK}/mapped.bam" 2>"${WORK}/bwa-log.txt" -GIRAFFE_RPS="$(cat "${WORK}/log.txt" | grep "reads per second" | sed 's/[^0-9.]//g')" + # Now we get all the batch times from BWA and use those to compute RPS values. + # This is optimistic but hopefully consistent. 
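# (Editorial sketch: the pipeline below expects bwa-mem batch lines of the
# shape its sed expression matches; with made-up values such as
#
#   [M::mem_process_seqs] Processed 100000 reads in 120.0 CPU sec, 25.0 real sec
#   [M::mem_process_seqs] Processed 100000 reads in 118.0 CPU sec, 25.0 real sec
#
# the awk sums give 200000 reads / 50.0 real sec = 4000 reads/second across
# all threads, which is then divided by THREAD_COUNT to get BWA_RPS.)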
+ BWA_RPS_ALL_THREADS="$(cat "${WORK}/bwa-log.txt" | grep "Processed" | sed 's/[^0-9]*\([0-9]*\) reads in .* CPU sec, \([0-9]*\.[0-9]*\) real sec/\1 \2/g' | tr ' ' '\t' | awk '{sum1+=$1; sum2+=$2} END {print sum1/sum2}')" -# Get RPS for bwa-mem -REAL_READ_COUNT="$(cat "${REAL_FASTQ}" | wc -l)" -((REAL_READ_COUNT /= 4)) + BWA_RPS="$(echo "${BWA_RPS_ALL_THREADS} / ${THREAD_COUNT}" | bc -l)" + + fi -${NUMA_PREFIX} bwa mem -t "${THREAD_COUNT}" "${FASTA}" "${REAL_FASTQ}" >"${WORK}/mapped.bam" 2>"${WORK}/bwa-log.txt" + # Align the real reads with map, ignoring speed + ${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" >"${WORK}/real-map.gam" -# Now we get all the batch times from BWA and use those to compute RPS values. -# This is optimistic but hopefully consistent. -BWA_RPS_ALL_THREADS="$(cat "${WORK}/bwa-log.txt" | grep "Processed" | sed 's/[^0-9]*\([0-9]*\) reads in .* CPU sec, \([0-9]*\.[0-9]*\) real sec/\1 \2/g' | tr ' ' '\t' | awk '{sum1+=$1; sum2+=$2} END {print sum1/sum2}')" + # Compute stats for giraffe and map on real reads + echo "Real read stats:" >"${WORK}/real-stats.txt" + echo "Giraffe:" >>"${WORK}/real-stats.txt" + vg stats -a "${WORK}/real.gam" >>"${WORK}/real-stats.txt" 2>&1 + echo "Map:" >>"${WORK}/real-stats.txt" + vg stats -a "${WORK}/real-map.gam" >>"${WORK}/real-stats.txt" 2>&1 +fi -BWA_RPS="$(echo "${BWA_RPS_ALL_THREADS} / ${THREAD_COUNT}" | bc -l)" echo "==== Giraffe Wrangler Report for vg $(vg version -s) ====" @@ -173,20 +219,44 @@ if [[ "${NUMA_WARNING}" == "1" ]] ; then echo "WARNING! Unable to restrict to a single NUMA node! Results may have high variance!" fi -if which perf >/dev/null 2>&1 ; then - # Output perf stuff - mv "${WORK}/perf.data" ./perf.data - mv "${WORK}/profile.svg" ./profile.svg - echo "Profiling information saved as ./perf.data" - echo "Interactive flame graph (for browsers) saved as ./profile.svg" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + if which perf >/dev/null 2>&1 ; then + # Output perf stuff + mv "${WORK}/perf.data" ./perf.data + mv "${WORK}/profile.svg" ./profile.svg + echo "Profiling information saved as ./perf.data" + echo "Interactive flame graph (for browsers) saved as ./profile.svg" + fi fi # Print the report -echo "Giraffe got ${CORRECT_COUNT} simulated reads correct with ${MEAN_IDENTITY} average identity per mapped base" -echo "Map got ${CORRECT_COUNT_MAP} simulated reads correct with ${MEAN_IDENTITY_MAP} average identity per mapped base" -echo "Giraffe aligned real reads at ${GIRAFFE_RPS} reads/second vs. bwa-mem's ${BWA_RPS} reads/second on ${THREAD_COUNT} threads" +if [[ ! -z "${SIM_GAM}" ]] ; then + # Include simulated reads + echo "Giraffe got ${CORRECT_COUNT} simulated reads correct with ${MEAN_IDENTITY} average identity per mapped base" + echo "Map got ${CORRECT_COUNT_MAP} simulated reads correct with ${MEAN_IDENTITY_MAP} average identity per mapped base" +fi +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Include real reads + echo "Giraffe aligned real reads at ${GIRAFFE_RPS} reads/second on ${THREAD_COUNT} threads" + if [[ ! -z "${FASTA}" ]] ; then + echo "bwa-mem aligned real reads at ${BWA_RPS} reads/second on ${THREAD_COUNT} threads" + fi +fi + +if [[ ! -z "${SIM_GAM}" ]] ; then + # Print Giraffe Facts for simulated reads + cat "${WORK}/facts.txt" +fi -cat "${WORK}/facts.txt" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Print real read stats + cat "${WORK}/real-stats.txt" +fi + +if [[ ! 
-z "${OUTPUT_DEST}" ]] ; then + # Save our intermediates + aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" +fi rm -Rf "${WORK}" From 9616c75c83e010576563cbdf67c5ad972692c83b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 11:45:20 -0800 Subject: [PATCH 16/79] Handle case where libsdsl.a exists but shouldn't `$(LIB_DIR)/libdivsufsort.a` and `$(LIB_DIR)/libdivsufsort64.a` are made by the SDSL build process but might not exist sometimes in the Travis cache. See https://stackoverflow.com/q/3046117 --- Makefile | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 55e69cb8494..f087d42fd12 100644 --- a/Makefile +++ b/Makefile @@ -381,8 +381,16 @@ ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) else +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) -endif +endif + +# Make sure the divsufsort libraries also come from SDSL +$(LIB_DIR)/libdivsufsort.a: $(LIB_DIR)/libsdsl.a + @ +$(LIB_DIR)/libdivsufsort64.a: $(LIB_DIR)/libsdsl.a + @ + +.SECONDARY: $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.h +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(LIB_DIR) @@ -397,7 +405,7 @@ $(LIB_DIR)/librocksdb.a: $(LIB_DIR)/libsnappy.a $(ROCKSDB_DIR)/db/*.cc $(ROCKSDB $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a -$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) +$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) libgcsa2.a $(FILTER) && mv libgcsa2.a $(CWD)/$(LIB_DIR) else @@ -406,7 +414,7 @@ endif $(INC_DIR)/gbwt/dynamic_gbwt.h: $(LIB_DIR)/libgbwt.a -$(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GBWT_DIR)/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) +$(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GBWT_DIR)/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv libgbwt.a $(CWD)/$(LIB_DIR) else @@ -415,7 +423,7 @@ endif $(INC_DIR)/gbwtgraph/gbwtgraph.h: $(LIB_DIR)/libgbwtgraph.a -$(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) +$(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) ifeq ($(shell uname -s),Darwin) +. 
./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv libgbwtgraph.a $(CWD)/$(LIB_DIR) else @@ -614,7 +622,7 @@ $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a . ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ -$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/src/*.cpp $(LIBBDSG_DIR)/include/bdsg/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp +$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/src/*.cpp $(LIBBDSG_DIR)/include/bdsg/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp +. ./source_me.sh && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && pwd && cp -r include/bdsg $(CWD)/$(INC_DIR) $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp @@ -626,7 +634,7 @@ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* # The xg repo has a cmake build system based all around external projects, and # we need it to use our installed versions of everything instead. -$(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(INC_DIR)/gfakluge.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a +$(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(INC_DIR)/gfakluge.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a +rm -f $@ +cp -r $(XG_DIR)/src/*.hpp $(CWD)/$(INC_DIR) +. 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER)

From 4c8aa98c569cba26c9708ef5138a38051e2adf8c Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 4 Nov 2019 15:43:43 -0500
Subject: [PATCH 17/79] small test for vg depth

---
 src/algorithms/coverage_depth.cpp |  4 +++-
 src/algorithms/coverage_depth.hpp |  2 ++
 src/subcommand/depth_main.cpp     | 21 ++++++++++++---------
 test/t/49_vg_depth.t              | 21 +++++++++++++++++++++
 4 files changed, 38 insertions(+), 10 deletions(-)
 create mode 100644 test/t/49_vg_depth.t

diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp
index 351ed63e1f0..b68b8c50b55 100644
--- a/src/algorithms/coverage_depth.cpp
+++ b/src/algorithms/coverage_depth.cpp
@@ -23,7 +23,9 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov
         for (size_t i = 0; i < cur_len; ++i) {
             cur_pos.set_offset(i);
             size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos));
-            out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n";
+            if (pos_coverage >= min_coverage) {
+                out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n";
+            }
             ++path_offset;
         }
     }
diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp
index 18d70daa26a..730cd316f5a 100644
--- a/src/algorithms/coverage_depth.hpp
+++ b/src/algorithms/coverage_depth.hpp
@@ -27,6 +27,8 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov
 /// (which is contained in the bin) will get the deletion edge's coverage counted.
 /// Other types of events (such as SNPs) can throw off coverage in similar ways but deletions tend to be bigger
 /// (and easier to find), so we hope that counting them is enough.
+/// If one wants to infer deletions from the coverage, obviously this should be false, but if looking for
+/// a background coverage for genotyping, then setting it to true may be helpful
 pair packed_depth_of_bin(const Packer& packer, step_handle_t start_step, step_handle_t end_plus_one_step, size_t min_coverage, bool include_deletions);

diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp
index 0e5fd75b1e4..d60f53177b9 100644
--- a/src/subcommand/depth_main.cpp
+++ b/src/subcommand/depth_main.cpp
@@ -29,18 +29,18 @@ void help_depth(char** argv) {
     cerr << "usage: " << argv[0] << " depth [options] " << endl
          << "options:" << endl
          << " packed coverage depth (print positional depths along path):" << endl
-         << "    -k, --pack FILE        Supports created from vg pack for given input graph" << endl
-         << "    -p, --ref-path NAME    Reference path to call on (multipile allowed. defaults to all paths)" << endl
-         << "    -b, --bin-size N       Bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
-         << "    -d, --count-dels       Count deletion edges within the bin as covering reference positions" << endl
+         << "    -k, --pack FILE        supports created from vg pack for given input graph" << endl
+         << "    -p, --ref-path NAME    reference path to call on (multiple allowed. defaults to all paths)" << endl
+         << "    -b, --bin-size N       bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
+         << "    -d, --count-dels       count deletion edges within the bin as covering reference positions" << endl
          << " GAM coverage depth (print for depth):" << endl
          << "    -g, --gam FILE         read alignments from this file (could be '-' for stdin)" << endl
          << "    -n, --max-nodes N      maximum nodes to consider [1000000]" << endl
          << "    -s, --random-seed N    random seed for sampling nodes to consider" << endl
-         << "    -Q, --min-mapq N       ignore alignments with mapping quality < N" << endl
+         << "    -Q, --min-mapq N       ignore alignments with mapping quality < N [0]" << endl
          << " common options:" << endl
-         << "    -m, --min-coverage N   ignore nodes with less than N coverage [1]" << endl
-         << "    -t, --threads N        Number of threads to use [all available]" << endl;
+         << "    -m, --min-coverage N   ignore nodes with less than N coverage [1]" << endl
+         << "    -t, --threads N        number of threads to use [all available]" << endl;
 }

 int main_depth(int argc, char** argv) {
@@ -192,8 +192,11 @@ int main_depth(int argc, char** argv) {
             vector> binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels);
             for (auto& bin_cov : binned_depth) {
-                cout << ref_path << "\t" << (get<0>(bin_cov) + 1)<< "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
-                     << "\t" << sqrt(get<3>(bin_cov)) << endl;
+                // bins can be nan if min_coverage filters everything out. just skip
+                if (!isnan(get<3>(bin_cov))) {
+                    cout << ref_path << "\t" << (get<0>(bin_cov) + 1)<< "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
+                         << "\t" << sqrt(get<3>(bin_cov)) << endl;
+                }
             }
         } else {
             algorithms::packed_depths(*packer, ref_path, min_coverage, cout);
diff --git a/test/t/49_vg_depth.t b/test/t/49_vg_depth.t
new file mode 100644
index 00000000000..fb2d46d29fc
--- /dev/null
+++ b/test/t/49_vg_depth.t
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+BASH_TAP_ROOT=../deps/bash-tap
+. ../deps/bash-tap/bash-tap-bootstrap
+
+PATH=../bin:$PATH # for vg
+
+plan tests 3
+
+vg construct -m 10 -r tiny/tiny.fa >flat.vg
+vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg
+vg sim -l 30 -x 2snp.vg -n 30 -a >2snp.sim
+vg index -x flat.xg -g flat.gcsa -k 16 flat.vg
+vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam
+vg pack -x flat.xg -o 2snp.gam.cx -g 2snp.gam
+# total read bases (30 * 30) / total graph bases 50 = 18
+is $(vg depth flat.vg -g 2snp.gam | awk '{print $1}') 18 "vg depth gets correct depth from gam"
+is $(vg depth flat.xg -k 2snp.gam.cx -b 100000 | awk '{print $4}') 18 "vg depth gets correct depth from pack"
+is $(vg depth flat.xg -k 2snp.gam.cx -b 10 | wc -l) 5 "vg depth gets correct number of bins"
+
+rm -f flat.vg flat.gcsa flat.xg 2snp.vg 2snp.sim 2snp.gam 2snp.gam.cx

From 0b819df32af63490dd9a2e77286d5d49f47bd150 Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Mon, 4 Nov 2019 13:19:06 -0800
Subject: [PATCH 18/79] Check if SDSL build actually happened, and re-copy

---
 Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index f087d42fd12..ec110cfcd6c 100644
--- a/Makefile
+++ b/Makefile
@@ -376,7 +376,7 @@ test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/json2pb.h $
 $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c
 	+.
./source_me.sh && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) $(FILTER) && cp -r lib/* $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ -$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp +$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp $(SDSL_DIR)/build/lib/libsdsl.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) else @@ -384,11 +384,12 @@ else endif # Make sure the divsufsort libraries also come from SDSL +# They might get deleted after libsdsl is installed $(LIB_DIR)/libdivsufsort.a: $(LIB_DIR)/libsdsl.a - @ + cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a: $(LIB_DIR)/libsdsl.a - @ + cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a $(LIB_DIR)/libdivsufsort64.a .SECONDARY: $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a From f280cb4cfffe9250457be53e45ab7817f5077933 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 13:19:54 -0800 Subject: [PATCH 19/79] Catch if distance index is missing snarls at use time --- scripts/giraffe-wrangler.sh | 2 +- src/min_distance.hpp | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index aeb2ce6d275..4e30c85df28 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -13,7 +13,7 @@ usage() { printf "\n" printf "Arguments:\n" printf " FASTA FASTA reference to run bwa-mem against; may be \"\"\n" - printf " XG_INDEX XG to annotate reads with positions\n" + printf " XG_INDEX XG to annotate reads with positions, with corresponding .gg GBWTGraph\n" printf " GCSA_INDEX GCSA (with LCP) for running vg map\n" printf " GBWT_INDEX Haplotypes for mapping with Giraffe\n" printf " MINIMIZER_INDEX Minimizers for mapping with Giraffe\n" diff --git a/src/min_distance.hpp b/src/min_distance.hpp index 7bcdc95bdf6..20ea9f6d715 100644 --- a/src/min_distance.hpp +++ b/src/min_distance.hpp @@ -385,8 +385,15 @@ class MinimumDistanceIndex { pair common_ancestor, pos_t& pos, bool rev) const; - //Get the index into chain_indexes/rank in chain of node i + /// Get the index into chain_indexes/rank in chain of node i. + /// Detects and throws an error if node i never got assigned to a snarl. size_t getPrimaryAssignment(id_t i) const { + auto stored = primary_snarl_assignments[i - min_node_id]; + if (stored == 0) { + // Somebody asked for a node. It should be assigned to a snarl, but it isn't. + throw runtime_error("Node " + std::to_string(i) + " not in any snarl. 
Distance index does " + + "not match graph or was not generated from a snarl set including trivial snarls."); + } return primary_snarl_assignments[i - min_node_id] - 1; } From ce6b5e6bc3a7d928d0e743786bb6427cf4f899ed Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 4 Nov 2019 16:48:53 -0500 Subject: [PATCH 20/79] make sim test more robust --- test/t/49_vg_depth.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/49_vg_depth.t b/test/t/49_vg_depth.t index fb2d46d29fc..c77a34a82d4 100644 --- a/test/t/49_vg_depth.t +++ b/test/t/49_vg_depth.t @@ -9,13 +9,13 @@ plan tests 3 vg construct -m 10 -r tiny/tiny.fa >flat.vg vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg -vg sim -l 30 -x 2snp.vg -n 30 -a >2snp.sim +vg sim -l 30 -x 2snp.vg -n 30 -a -s 1 >2snp.sim vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam vg pack -x flat.xg -o 2snp.gam.cx -g 2snp.gam # total read bases (30 * 30) / total graph bases 50 = 18 is $(vg depth flat.vg -g 2snp.gam | awk '{print $1}') 18 "vg depth gets correct depth from gam" -is $(vg depth flat.xg -k 2snp.gam.cx -b 100000 | awk '{print $4}') 18 "vg depth gets correct depth from pack" +is $(vg depth flat.xg -k 2snp.gam.cx -b 100000 | awk '{print int($4)}') 18 "vg depth gets correct depth from pack" is $(vg depth flat.xg -k 2snp.gam.cx -b 10 | wc -l) 5 "vg depth gets correct number of bins" rm -f flat.vg flat.gcsa flat.xg 2snp.vg 2snp.sim 2snp.gam 2snp.gam.cx From 6047991af595ebdc25a8b9396a18ddad640aeffb Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 14:20:01 -0800 Subject: [PATCH 21/79] Use pattern rule to express multiple SDSL libs --- Makefile | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index ec110cfcd6c..a99320eb884 100644 --- a/Makefile +++ b/Makefile @@ -376,23 +376,16 @@ test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/json2pb.h $ $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c +. ./source_me.sh && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) $(FILTER) && cp -r lib/* $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ -$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp $(SDSL_DIR)/build/lib/libsdsl.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a +# Use fake patterns to tell Make that this rule generates all these files when run once. +# Here % should always match "lib" which is a common substring. +# See https://stackoverflow.com/a/19822767 +$(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) else +. 
./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) endif -# Make sure the divsufsort libraries also come from SDSL -# They might get deleted after libsdsl is installed -$(LIB_DIR)/libdivsufsort.a: $(LIB_DIR)/libsdsl.a - cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(LIB_DIR)/libdivsufsort.a - -$(LIB_DIR)/libdivsufsort64.a: $(LIB_DIR)/libsdsl.a - cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a $(LIB_DIR)/libdivsufsort64.a - -.SECONDARY: $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a - $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.h +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(LIB_DIR) From 07a23d9ce3e6249fb53b384a1b8a8c9df2135214 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 14:21:44 -0800 Subject: [PATCH 22/79] Don't try and use aws s3 to move files locally --- scripts/giraffe-wrangler.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index 4e30c85df28..908d21e9f08 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -254,8 +254,13 @@ if [[ ! -z "${REAL_FASTQ}" ]] ; then fi if [[ ! -z "${OUTPUT_DEST}" ]] ; then - # Save our intermediates - aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" + if [[ "${OUTPUT_DEST}" == s3://* ]] ; then + # Save our intermediates to S3 + aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" + else + # Save our intermediates to disk + cp -R "${WORK}" "${OUTPUT_DEST}" + fi fi rm -Rf "${WORK}" From a7a1220d580651b6fe502a1a3921915c049780a3 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Mon, 4 Nov 2019 15:48:38 -0800 Subject: [PATCH 23/79] added bidirectional gbwt option --- src/subcommand/rna_main.cpp | 13 ++++++++++--- src/transcriptome.cpp | 4 ++-- src/transcriptome.hpp | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index c40151dd5a5..1d180231ded 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -33,6 +33,7 @@ void help_rna(char** argv) { << " -a, --add-non-ref-paths add non-reference transcripts as embedded paths in the splice graph" << endl << " -u, --out-ref-paths output reference transcripts in GBWT, fasta and info" << endl << " -b, --write-gbwt FILE write transcripts as threads to GBWT index file" << endl + << " -g, --gbwt-bidirectional add transcripts as bidirectional threads to GBWT index" << endl << " -f, --write-fasta FILE write transcripts as sequences to fasta file" << endl << " -i, --write-info FILE write transcript origin info to tsv file" << endl << " -t, --threads INT number of compute threads to use [1]" << endl @@ -59,6 +60,7 @@ int32_t main_rna(int32_t argc, char** argv) { bool add_non_reference_transcript_paths = false; bool output_reference_transcript_paths = false; string gbwt_out_filename = ""; + bool gbwt_add_bidirectional = false; string fasta_out_filename = ""; string info_out_filename = ""; int32_t num_threads = 1; @@ -81,6 +83,7 @@ int32_t main_rna(int32_t argc, char** argv) { {"add-non-ref-paths", no_argument, 0, 'a'}, {"out-ref-paths", no_argument, 0, 'u'}, {"write-gbwt", no_argument, 0, 'b'}, + {"gbwt-bidirectional", no_argument, 0, 'g'}, {"write-fasta", no_argument, 0, 'f'}, {"write-info", no_argument, 0, 'i'}, {"threads", no_argument, 0, 't'}, @@ -90,7 +93,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t 
option_index = 0;

-        c = getopt_long(argc, argv, "n:s:l:ercdoraub:f:i:t:ph?", long_options, &option_index);
+        c = getopt_long(argc, argv, "n:s:l:ercdoraub:gf:i:t:ph?", long_options, &option_index);

         /* Detect the end of the options. */
         if (c == -1)
@@ -143,6 +146,10 @@ int32_t main_rna(int32_t argc, char** argv) {
             gbwt_out_filename = optarg;
             break;

+        case 'g':
+            gbwt_add_bidirectional = true;
+            break;
+
         case 'f':
             fasta_out_filename = optarg;
             break;
@@ -283,13 +290,13 @@ int32_t main_rna(int32_t argc, char** argv) {
     // Construct and write GBWT index of transcript paths in transcriptome.
     if (!gbwt_out_filename.empty()) {

-        if (show_progress) { cerr << "[vg rna] Writing transcripts as threads to GBWT index file ..." << endl; }
+        if (show_progress) { cerr << "[vg rna] Writing transcripts as " << ((gbwt_add_bidirectional) ? "bidirectional " : "") << "threads to GBWT index file ..." << endl; }

         // Silence GBWT index construction.
         gbwt::Verbosity::set(gbwt::Verbosity::SILENT);
         gbwt::GBWTBuilder gbwt_builder(gbwt::bit_length(gbwt::Node::encode(transcriptome.splice_graph().max_node_id(), true)));

-        transcriptome.construct_gbwt(&gbwt_builder, output_reference_transcript_paths);
+        transcriptome.construct_gbwt(&gbwt_builder, output_reference_transcript_paths, gbwt_add_bidirectional);

         // Finish construction and recode index.
         gbwt_builder.finish();
diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index bf2f1ef4b07..7c7e4866677 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -1109,7 +1109,7 @@ void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const
     }
 }

-void Transcriptome::construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts) const {
+void Transcriptome::construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts, const bool add_bidirectional) const {

     vector sample_names;
     sample_names.reserve(size());
@@ -1131,7 +1131,7 @@ void Transcriptome::construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool
         }

         // Insert transcript path as thread into GBWT index.
-        gbwt_builder->insert(gbwt_thread, false);
+        gbwt_builder->insert(gbwt_thread, add_bidirectional);

         // Insert transcript path name into GBWT index.
         gbwt_builder->index.metadata.addPath({static_cast(sample_names.size()), 0, 0, 0});
diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp
index 3abbd1210e9..2ace89c60af 100644
--- a/src/transcriptome.hpp
+++ b/src/transcriptome.hpp
@@ -125,7 +125,7 @@ class Transcriptome {
     void embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths, const bool rebuild_indexes);

     /// Add transcript paths as threads in GBWT index.
-    void construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts) const;
+    void construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts, const bool add_bidirectional) const;

    /// Writes transcript paths as alignments to a gam file.
void write_alignments(ostream * gam_ostream, const bool output_reference_transcripts) const;

From f17fce05ee96a2cf0bba1038546011631f1bd3a1 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 5 Nov 2019 13:45:48 -0500
Subject: [PATCH 24/79] refactor support computation apart from genotyping

---
 src/graph_caller.cpp         |  10 +-
 src/snarl_caller.cpp         | 339 +++--------------------------
 src/snarl_caller.hpp         |  79 +-------
 src/subcommand/call_main.cpp |   9 +-
 src/traversal_support.cpp    | 321 +++++++++++++++++++++++++++
 src/traversal_support.hpp    | 113 ++++++++++++
 6 files changed, 476 insertions(+), 395 deletions(-)
 create mode 100644 src/traversal_support.cpp
 create mode 100644 src/traversal_support.hpp

diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp
index 3da529f2d3f..4387ef0b63d 100644
--- a/src/graph_caller.cpp
+++ b/src/graph_caller.cpp
@@ -296,8 +296,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph,
                                                    0,
                                                    0,
                                                    get_path_index,
-                                                   [&](id_t id) { return snarl_caller.get_min_node_support(id);},
-                                                   [&](edge_t edge) { return snarl_caller.get_edge_support(edge);});
+                                                   [&](id_t id) { return snarl_caller.get_support_finder().get_min_node_support(id);},
+                                                   [&](edge_t edge) { return snarl_caller.get_support_finder().get_edge_support(edge);});
     } else {
         // our graph is not in vg format. we will make graphs for each site as needed and work with those

@@ -364,7 +364,7 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) {
        // determine the support threshold for the traversal finder. if we're using average
        // support, then we don't use any (set to 0), otherwise, use the minimum support for a call
        SupportBasedSnarlCaller& support_caller = dynamic_cast<SupportBasedSnarlCaller&>(snarl_caller);
-       size_t threshold = support_caller.get_average_traversal_support_switch_threshold();
+       size_t threshold = support_caller.get_support_finder().get_average_traversal_support_switch_threshold();
        double support_cutoff = total_snarl_length <= threshold ?
support_caller.get_min_total_support_for_call() : 0; rep_trav_finder = new RepresentativeTraversalFinder(vg_graph, snarl_manager, max_search_depth, @@ -373,10 +373,10 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { support_cutoff, support_cutoff, get_path_index, - [&](id_t id) { return support_caller.get_min_node_support(id);}, + [&](id_t id) { return support_caller.get_support_finder().get_min_node_support(id);}, // note: because our traversal finder and support caller have // different graphs, they can't share edge handles - [&](edge_t edge) { return support_caller.get_edge_support( + [&](edge_t edge) { return support_caller.get_support_finder().get_edge_support( vg_graph.get_id(edge.first), vg_graph.get_is_reverse(edge.first), vg_graph.get_id(edge.second), vg_graph.get_is_reverse(edge.second));}); diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index b323785e679..e13c66fb6e9 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -13,9 +13,11 @@ function SnarlCaller::get_skip_allele_fn() const { return [](const SnarlTraversal&) { return false; }; } -SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager) : +SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder) : graph(graph), - snarl_manager(snarl_manager) { + snarl_manager(snarl_manager), + support_finder(support_finder) { } SupportBasedSnarlCaller::~SupportBasedSnarlCaller() { @@ -57,10 +59,10 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, #endif // get the traversal sizes - vector traversal_sizes = get_traversal_sizes(traversals); + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = get_traversal_set_support(traversals, {}, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, ref_trav_idx); int best_allele = get_best_support(supports, {}); #ifdef debug @@ -75,7 +77,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = get_traversal_set_support(traversals, {best_allele}, true, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, ref_trav_idx); vector skips = {best_allele}; for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); @@ -88,7 +90,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, } } // get the supports of each traversal in light of best - vector secondary_supports = get_traversal_set_support(traversals, {best_allele}, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, ref_trav_idx); int second_best_allele = get_best_support(secondary_supports, {skips}); // get the supports of each traversal in light of second best @@ -97,7 +99,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, int third_best_allele = -1; if (second_best_allele != -1) { // prune out traversals whose exclusive support relative to second best doesn't pass cut - vector tertiary_exclusive_supports = get_traversal_set_support(traversals, 
{second_best_allele}, true, false, ref_trav_idx); + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, true, false, ref_trav_idx); skips.push_back(best_allele); skips.push_back(second_best_allele); for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { @@ -106,7 +108,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, skips.push_back(i); } } - tertiary_supports = get_traversal_set_support(traversals, {second_best_allele}, false, false, ref_trav_idx); + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, ref_trav_idx); third_best_allele = get_best_support(tertiary_supports, skips); } @@ -253,11 +255,11 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, shared_travs.push_back(genotype[0]); } // compute the support of our called alleles - vector allele_supports = get_traversal_set_support(traversals, shared_travs, false, false, 0); + vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, 0); // get the support of our uncalled alleles, making sure to not include any called support // TODO: handle shared support within this set - vector uncalled_supports = get_traversal_set_support(traversals, genotype, false, true, 0); + vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, 0); // Set up the depth format field variant.format.push_back("DP"); @@ -366,221 +368,6 @@ void SupportBasedSnarlCaller::update_vcf_header(string& header) const { std::to_string(min_site_depth) + "\">\n"; } -function SupportBasedSnarlCaller::get_skip_allele_fn() const { - // port over cutoff used in old support caller (there avg support used all the time, here - // we use the same toggles as when genotyping) - return [&](const SnarlTraversal& trav) -> bool { - return support_val(get_traversal_support(trav)) < min_alt_path_support; - }; -} - -int64_t SupportBasedSnarlCaller::get_edge_length(const edge_t& edge, const unordered_map& ref_offsets) const { - int len = -1; - // use our reference traversal to try to come up with a deletion length for our edge - // idea: if our edge corresponds to a huge deltion, it should be weighted accordingly - auto s_it = ref_offsets.find(graph.get_id(edge.first)); - auto e_it = ref_offsets.find(graph.get_id(edge.second)); - if (s_it != ref_offsets.end() && e_it != ref_offsets.end()) { - size_t start_offset = s_it->second; - if (!graph.get_is_reverse(edge.first)) { - start_offset += graph.get_length(edge.first); - } - size_t end_offset = e_it->second; - if (graph.get_is_reverse(edge.second)) { - end_offset += graph.get_length(edge.second); - } - if (start_offset > end_offset) { - std::swap(start_offset, end_offset); - } - len = end_offset - start_offset; - } - return std::max(len, 1); -} - -tuple SupportBasedSnarlCaller::get_child_support(const Snarl& snarl) const { - // port over old functionality from support caller - // todo: do we need to flag nodes as covered like it does? 
- pair, unordered_set > contents = snarl_manager.deep_contents(&snarl, graph, true); - Support child_max_support; - Support child_total_support; - size_t child_size = 0; - for (id_t node_id : contents.first) { - Support child_support = get_avg_node_support(node_id); - child_max_support = support_max(child_max_support, child_support); - child_size += graph.get_length(graph.get_handle(node_id)); - child_total_support += child_support; - } - Support child_avg_support = child_total_support / child_size; - // we always use child_max like the old support_caller. - // this is the only way to get top-down recursion to work in many cases - // todo: fix to use bottom up, get get support from actual traversals - // every time!! - return std::tie(child_max_support, child_max_support, child_size); -} - - -Support SupportBasedSnarlCaller::get_traversal_support(const SnarlTraversal& traversal) const { - return get_traversal_set_support({traversal}, {}, false, false).at(0); -} - -vector SupportBasedSnarlCaller::get_traversal_set_support(const vector& traversals, - const vector& shared_travs, - bool exclusive_only, - bool exclusive_count, - int ref_trav_idx) const { - - // pass 1: how many times have we seen a node or edge - unordered_map node_counts; - unordered_map edge_counts; - map child_counts; - - for (auto trav_idx : shared_travs) { - const SnarlTraversal& trav = traversals[trav_idx]; - for (int i = 0; i < trav.visit_size(); ++i) { - const Visit& visit = trav.visit(i); - if (visit.node_id() != 0) { - // Count the node once - if (node_counts.count(visit.node_id())) { - node_counts[visit.node_id()] += 1; - } else { - node_counts[visit.node_id()] = 1; - } - } else { - // Count the child once - if (child_counts.count(visit.snarl())) { - child_counts[visit.snarl()] += 1; - } else { - child_counts[visit.snarl()] = 1; - } - } - // note: there is no edge between adjacent snarls as they overlap - // on their endpoints. 
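// (Editorial note on the code being moved out of this file: the per-element
// counts collected in this pass feed the scaling rule in pass 2, where support
// on an element shared with share_count of the shared_travs is multiplied by
// 1 / (1 + share_count), or zeroed in the exclusive modes. For instance,
// assuming an edge with forward read support 10 that one already-called allele
// also uses:
//
//     int share_count = 1;
//     double scale_factor = 1.0 / (1.0 + share_count);  // 0.5
//     Support scaled = edge_support * scale_factor;     // contributes 5
//
// which is exactly how update_support applies it further down.)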
- if (i > 0 && (trav.visit(i - 1).node_id() != 0 || trav.visit(i).node_id() != 0)) { - edge_t edge = to_edge(graph, trav.visit(i - 1), visit); - // Count the edge once - if (edge_counts.count(edge)) { - edge_counts[edge] += 1; - } else { - edge_counts[edge] = 1; - } - } - } - } - - // pass 1.5: get index for looking up deletion edge lengths (so far we aren't dependent - // on having anything but a path handle graph, so we index on the fly) - unordered_map ref_offsets; - if (ref_trav_idx >= 0) { - ref_offsets = get_ref_offsets(traversals[ref_trav_idx]); - } - - // pass 2: get the supports - // we compute the various combinations of min/avg node/trav supports as we don't know which - // we will need until all the sizes are known - Support max_support; - max_support.set_forward(numeric_limits::max()); - vector min_supports_min(traversals.size(), max_support); // use min node support - vector min_supports_avg(traversals.size(), max_support); // use avg node support - vector has_support(traversals.size(), false); - vector tot_supports_min(traversals.size()); // weighted by lengths, using min node support - vector tot_supports_avg(traversals.size()); // weighted by lengths, using avg node support - vector tot_sizes(traversals.size(), 0); // to compute average from to_supports; - vector tot_sizes_all(traversals.size(), 0); // as above, but includes excluded lengths - int max_trav_size = 0; // size of longest traversal - - bool count_end_nodes = false; // toggle to include snarl ends - - auto update_support = [&] (int trav_idx, const Support& min_support, - const Support& avg_support, int length, int share_count) { - // keep track of overall size of longest traversal - tot_sizes_all[trav_idx] += length; - max_trav_size = std::max(tot_sizes_all[trav_idx], max_trav_size); - - // apply the scaling - double scale_factor = ((exclusive_only || exclusive_count) && share_count > 0) ? 0. : 1. / (1. 
+ share_count); - - // when looking at exclusive support, we don't normalize by skipped lengths - if (scale_factor != 0 || !exclusive_only || exclusive_count) { - has_support[trav_idx] = true; - Support scaled_support_min = min_support * scale_factor; - Support scaled_support_avg = avg_support * scale_factor; - - tot_supports_min[trav_idx] += scaled_support_min; - tot_supports_avg[trav_idx] += scaled_support_avg * length; - tot_sizes[trav_idx] += length; - min_supports_min[trav_idx] = support_min(min_supports_min[trav_idx], scaled_support_min); - min_supports_avg[trav_idx] = support_min(min_supports_avg[trav_idx], scaled_support_avg * length); - } - }; - - for (int trav_idx = 0; trav_idx < traversals.size(); ++trav_idx) { - const SnarlTraversal& trav = traversals[trav_idx]; - for (int visit_idx = 0; visit_idx < trav.visit_size(); ++visit_idx) { - const Visit& visit = trav.visit(visit_idx); - Support min_support; - Support avg_support; - int64_t length; - int share_count = 0; - - if (visit.node_id() != 0) { - // get the node support - min_support = get_min_node_support(visit.node_id()); - avg_support = get_avg_node_support(visit.node_id()); - length = graph.get_length(graph.get_handle(visit.node_id())); - if (node_counts.count(visit.node_id())) { - share_count = node_counts[visit.node_id()]; - } - } else { - // get the child support - tie(min_support, avg_support, length) = get_child_support(visit.snarl()); - if (child_counts.count(visit.snarl())) { - share_count = child_counts[visit.snarl()]; - } - } - if (count_end_nodes || (visit_idx > 0 && visit_idx < trav.visit_size() - 1)) { - update_support(trav_idx, min_support, avg_support, length, share_count); - } - share_count = 0; - - if (visit_idx > 0 && (trav.visit(visit_idx - 1).node_id() != 0 || trav.visit(visit_idx).node_id() != 0)) { - // get the edge support - edge_t edge = to_edge(graph, trav.visit(visit_idx - 1), visit); - min_support = get_edge_support(edge); - length = get_edge_length(edge, ref_offsets); - if (edge_counts.count(edge)) { - share_count = edge_counts[edge]; - } - update_support(trav_idx, min_support, min_support, length, share_count); - } - } - } - - // correct for case where no exclusive support found - for (int i = 0; i < min_supports_min.size(); ++i) { - if (!has_support[i]) { - min_supports_min[i] = Support(); - min_supports_avg[i] = Support(); - } - } - - bool use_avg_trav_support = max_trav_size >= average_traversal_support_switch_threshold; - bool use_avg_node_support = max_trav_size >= average_node_support_switch_threshold; - - if (use_avg_trav_support) { - vector& tot_supports = use_avg_node_support ? tot_supports_avg : tot_supports_min; - for (int i = 0; i < tot_supports.size(); ++i) { - if (tot_sizes[i] > 0) { - tot_supports[i] /= (double)tot_sizes[i]; - } else { - tot_supports[i] = Support(); - } - } - return tot_supports; - } else { - return use_avg_node_support ? 
min_supports_avg : min_supports_min; - } -} - int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { int best_allele = -1; for(size_t i = 0; i < supports.size(); i++) { @@ -592,35 +379,24 @@ int SupportBasedSnarlCaller::get_best_support(const vector& supports, c return best_allele; } -vector SupportBasedSnarlCaller::get_traversal_sizes(const vector& traversals) const { - vector sizes(traversals.size(), 0); - for (int i = 0; i < traversals.size(); ++i) { - for (int j = 0; j < traversals[i].visit_size(); ++j) { - if (traversals[i].visit(j).node_id() != 0) { - sizes[i] += graph.get_length(graph.get_handle(traversals[i].visit(j).node_id())); - } else { - // just summing up the snarl contents, which isn't a great heuristic but will - // help in some cases - pair, unordered_set > contents = snarl_manager.deep_contents( - snarl_manager.into_which_snarl(traversals[i].visit(j)), graph, true); - for (id_t node_id : contents.first) { - sizes[i] += graph.get_length(graph.get_handle(node_id)); - } - } - } - } - return sizes; - +function SupportBasedSnarlCaller::get_skip_allele_fn() const { + // port over cutoff used in old support caller (there avg support used all the time, here + // we use the same toggles as when genotyping) + return [&](const SnarlTraversal& trav) -> bool { + return support_val(support_finder.get_traversal_support(trav)) < min_alt_path_support; + }; } -size_t SupportBasedSnarlCaller::get_average_traversal_support_switch_threshold() const { - return average_traversal_support_switch_threshold; -} int SupportBasedSnarlCaller::get_min_total_support_for_call() const { return min_total_support_for_call; } +const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { + return support_finder; +} + + double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, int second_best_trav, int ref_trav_idx) const { bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || @@ -653,76 +429,7 @@ double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int return bias_limit; } -unordered_map SupportBasedSnarlCaller::get_ref_offsets(const SnarlTraversal& ref_trav) const { - unordered_map ref_offsets; - size_t offset = 0; - for (int i = 0; i < ref_trav.visit_size(); ++i) { - const Visit& visit = ref_trav.visit(i); - if (visit.node_id() != 0) { - if (visit.backward()) { - offset += graph.get_length(graph.get_handle(visit.node_id())); - ref_offsets[visit.node_id()] = offset; - } else { - ref_offsets[visit.node_id()] = offset; - offset += graph.get_length(graph.get_handle(visit.node_id())); - } - } - } - return ref_offsets; -} - -PackedSupportSnarlCaller::PackedSupportSnarlCaller(const Packer& packer, SnarlManager& snarl_manager) : - SupportBasedSnarlCaller(*dynamic_cast(packer.get_graph()), snarl_manager), - packer(packer) { -} - -PackedSupportSnarlCaller::~PackedSupportSnarlCaller() { -} -Support PackedSupportSnarlCaller::get_edge_support(const edge_t& edge) const { - return get_edge_support(graph.get_id(edge.first), graph.get_is_reverse(edge.first), - graph.get_id(edge.second), graph.get_is_reverse(edge.second)); -} - -Support PackedSupportSnarlCaller::get_edge_support(id_t from, bool from_reverse, - id_t to, bool to_reverse) const { - Edge proto_edge; - proto_edge.set_from(from); - proto_edge.set_from_start(from_reverse); - proto_edge.set_to(to); - proto_edge.set_to_end(to_reverse); - Support support; - support.set_forward(packer.edge_coverage(proto_edge)); - 
return support; -} - -Support PackedSupportSnarlCaller::get_min_node_support(id_t node) const { - Position pos; - pos.set_node_id(node); - size_t offset = packer.position_in_basis(pos); - size_t coverage = packer.coverage_at_position(offset); - size_t end_offset = offset + graph.get_length(graph.get_handle(node)); - for (int i = offset + 1; i < end_offset; ++i) { - coverage = min(coverage, packer.coverage_at_position(i)); - } - Support support; - support.set_forward(coverage); - return support; -} - -Support PackedSupportSnarlCaller::get_avg_node_support(id_t node) const { - Position pos; - pos.set_node_id(node); - size_t offset = packer.position_in_basis(pos); - size_t coverage = 0; - size_t length = graph.get_length(graph.get_handle(node)); - for (int i = 0; i < length; ++i) { - coverage += packer.coverage_at_position(offset + i); - } - Support support; - support.set_forward((double)coverage / (double)length); - return support; -} } diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 4c0edaf6e98..40b4bab6784 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -11,7 +11,7 @@ #include "handle.hpp" #include "snarls.hpp" #include "genotypekit.hpp" -#include "packer.hpp" +#include "traversal_support.hpp" namespace vg { @@ -51,29 +51,14 @@ class SnarlCaller { */ class SupportBasedSnarlCaller : public SnarlCaller { public: - SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager); + SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder); virtual ~SupportBasedSnarlCaller(); /// Set some of the parameters void set_het_bias(double het_bias, double ref_het_bias = 0.); void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); - /// Support of an edge - virtual Support get_edge_support(const edge_t& edge) const = 0; - virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const = 0; - - /// Effective length of an edge - virtual int64_t get_edge_length(const edge_t& edge, const unordered_map& ref_offsets) const; - - /// Minimum support of a node - virtual Support get_min_node_support(id_t node) const = 0; - - /// Average support of a node - virtual Support get_avg_node_support(id_t node) const = 0; - - /// Use node or edge support as proxy for child support (as was done in original calling code) - virtual tuple get_child_support(const Snarl& snarl) const; - /// Get the genotype of a site virtual vector genotype(const Snarl& snarl, const vector& traversals, @@ -93,31 +78,12 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Use min_alt_path_support threshold as cutoff virtual function get_skip_allele_fn() const; - /// Get the support of a traversal - /// Child snarls are handled as in the old call code: their maximum support is used - virtual Support get_traversal_support(const SnarlTraversal& traversal) const; - - /// Get the support of a set of traversals. Any support overlapping traversals in shared_travs - /// will have their support split. If exclusive_only is true, then any split support gets - /// rounded down to 0 (and ignored when computing mins or averages) . 
- /// exclusive_count is like exclusive only except shared traversals will be counted (as 0) - /// when doing average and min support - /// if the ref_trav_idx is given, it will be used for computing (deletion) edge lengths - virtual vector get_traversal_set_support(const vector& traversals, - const vector& shared_travs, - bool exclusive_only, - bool exclusive_count, - int ref_trav_idx = -1) const; - - /// Get the total length of all nodes in the traversal - virtual vector get_traversal_sizes(const vector& traversals) const; - - /// Get the average traversal support thresholdek - virtual size_t get_average_traversal_support_switch_threshold() const; - /// Get the minimum total support for call virtual int get_min_total_support_for_call() const; + /// Get the traversal support finder + const TraversalSupportFinder& get_support_finder() const; + protected: /// Get the best support out of a list of supports, ignoring skips @@ -162,12 +128,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { size_t min_site_depth = 3; /// what's the min log likelihood for allele depth assignments to PASS? double min_ad_log_likelihood_for_filter = -9; - /// Use average instead of minimum support when determining a traversal's support - /// its node and edge supports. - size_t average_traversal_support_switch_threshold = 50; - /// Use average instead of minimum support when determining a node's support - /// its position supports. - size_t average_node_support_switch_threshold = 50; /// used only for pruning alleles in the VCFTraversalFinder: minimum support /// of an allele's alt-path for it to be considered in the brute-force enumeration double min_alt_path_support = 0.2; @@ -176,35 +136,10 @@ class SupportBasedSnarlCaller : public SnarlCaller { SnarlManager& snarl_manager; - // todo: background support - + TraversalSupportFinder& support_finder; }; -/** - * Get the read support from a Packer object - */ -class PackedSupportSnarlCaller : public SupportBasedSnarlCaller { -public: - PackedSupportSnarlCaller(const Packer& packer, SnarlManager& snarl_manager); - virtual ~PackedSupportSnarlCaller(); - - /// Support of an edge - virtual Support get_edge_support(const edge_t& edge) const; - virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const; - - /// Minimum support of a node - virtual Support get_min_node_support(id_t node) const; - - /// Average support of a node - virtual Support get_avg_node_support(id_t node) const; - -protected: - - /// Derive supports from this pack index - const Packer& packer; -}; - // debug helpers inline string to_string(const HandleGraph& graph, handle_t handle) { return std::to_string(graph.get_id(handle)) + ":" + std::to_string(graph.get_is_reverse(handle)); diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 82df0928571..c2ebac17dda 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -248,11 +248,16 @@ int main_call(int argc, char** argv) { // Make a Packed Support Caller unique_ptr packer; + unique_ptr support_finder; if (!pack_filename.empty()) { // Load our packed supports (they must have come from vg pack on graph) packer = unique_ptr(new Packer(graph)); packer->load_from_file(pack_filename); - PackedSupportSnarlCaller* packed_caller = new PackedSupportSnarlCaller(*packer, *snarl_manager); + // Make a packed traversal support finder + PackedTraversalSupportFinder* packed_support_finder = new PackedTraversalSupportFinder(*packer, *snarl_manager); + support_finder = 
+        // Make a support caller
+        SupportBasedSnarlCaller* packed_caller = new SupportBasedSnarlCaller(*graph, *snarl_manager, *packed_support_finder);
         if (het_bias >= 0) {
             packed_caller->set_het_bias(het_bias, ref_het_bias);
         }
@@ -263,7 +268,7 @@ int main_call(int argc, char** argv) {
     }
 
     if (!snarl_caller) {
-        cerr << "error [vg call]: pack file (-p) is required" << endl;
+        cerr << "error [vg call]: pack file (-k) is required" << endl;
         return 1;
     }
 
diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp
new file mode 100644
index 00000000000..9372a995a73
--- /dev/null
+++ b/src/traversal_support.cpp
@@ -0,0 +1,321 @@
+#include "traversal_support.hpp"
+#include "genotypekit.hpp"
+
+//#define debug
+
+namespace vg {
+
+TraversalSupportFinder::TraversalSupportFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager) :
+    graph(graph),
+    snarl_manager(snarl_manager) {
+}
+
+TraversalSupportFinder::~TraversalSupportFinder() {
+
+}
+
+int64_t TraversalSupportFinder::get_edge_length(const edge_t& edge, const unordered_map<id_t, size_t>& ref_offsets) const {
+    int len = -1;
+    // use our reference traversal to try to come up with a deletion length for our edge
+    // idea: if our edge corresponds to a huge deletion, it should be weighted accordingly
+    auto s_it = ref_offsets.find(graph.get_id(edge.first));
+    auto e_it = ref_offsets.find(graph.get_id(edge.second));
+    if (s_it != ref_offsets.end() && e_it != ref_offsets.end()) {
+        size_t start_offset = s_it->second;
+        if (!graph.get_is_reverse(edge.first)) {
+            start_offset += graph.get_length(edge.first);
+        }
+        size_t end_offset = e_it->second;
+        if (graph.get_is_reverse(edge.second)) {
+            end_offset += graph.get_length(edge.second);
+        }
+        if (start_offset > end_offset) {
+            std::swap(start_offset, end_offset);
+        }
+        len = end_offset - start_offset;
+    }
+    return std::max(len, 1);
+}
+
+tuple<Support, Support, int> TraversalSupportFinder::get_child_support(const Snarl& snarl) const {
+    // port over old functionality from support caller
+    // todo: do we need to flag nodes as covered like it does?
+    pair<unordered_set<id_t>, unordered_set<edge_t> > contents = snarl_manager.deep_contents(&snarl, graph, true);
+    Support child_max_support;
+    Support child_total_support;
+    size_t child_size = 0;
+    for (id_t node_id : contents.first) {
+        Support child_support = get_avg_node_support(node_id);
+        child_max_support = support_max(child_max_support, child_support);
+        child_size += graph.get_length(graph.get_handle(node_id));
+        child_total_support += child_support;
+    }
+    Support child_avg_support = child_total_support / child_size;
+    // we always use child_max like the old support_caller.
+    // this is the only way to get top-down recursion to work in many cases
+    // todo: fix to use bottom up, get support from actual traversals
+    // every time!!
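+    // Illustration with made-up numbers: two child nodes with average supports
+    // 10 and 2 and lengths 8 and 2 give child_total_support = 12 and
+    // child_size = 10, so child_avg_support = 1.2 while child_max_support = 10;
+    // the tuple returned below then reports 10 for both the min and avg slots.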
+    return std::tie(child_max_support, child_max_support, child_size);
+}
+
+
+Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const {
+    return get_traversal_set_support({traversal}, {}, false, false).at(0);
+}
+
+vector<Support> TraversalSupportFinder::get_traversal_set_support(const vector<SnarlTraversal>& traversals,
+                                                                  const vector<int>& shared_travs,
+                                                                  bool exclusive_only,
+                                                                  bool exclusive_count,
+                                                                  int ref_trav_idx) const {
+
+    // pass 1: how many times have we seen a node or edge
+    unordered_map<id_t, int> node_counts;
+    unordered_map<edge_t, int> edge_counts;
+    map<Snarl, int> child_counts;
+
+    for (auto trav_idx : shared_travs) {
+        const SnarlTraversal& trav = traversals[trav_idx];
+        for (int i = 0; i < trav.visit_size(); ++i) {
+            const Visit& visit = trav.visit(i);
+            if (visit.node_id() != 0) {
+                // Count the node once
+                if (node_counts.count(visit.node_id())) {
+                    node_counts[visit.node_id()] += 1;
+                } else {
+                    node_counts[visit.node_id()] = 1;
+                }
+            } else {
+                // Count the child once
+                if (child_counts.count(visit.snarl())) {
+                    child_counts[visit.snarl()] += 1;
+                } else {
+                    child_counts[visit.snarl()] = 1;
+                }
+            }
+            // note: there is no edge between adjacent snarls as they overlap
+            // on their endpoints.
+            if (i > 0 && (trav.visit(i - 1).node_id() != 0 || trav.visit(i).node_id() != 0)) {
+                edge_t edge = to_edge(graph, trav.visit(i - 1), visit);
+                // Count the edge once
+                if (edge_counts.count(edge)) {
+                    edge_counts[edge] += 1;
+                } else {
+                    edge_counts[edge] = 1;
+                }
+            }
+        }
+    }
+
+    // pass 1.5: get index for looking up deletion edge lengths (so far we aren't dependent
+    // on having anything but a path handle graph, so we index on the fly)
+    unordered_map<id_t, size_t> ref_offsets;
+    if (ref_trav_idx >= 0) {
+        ref_offsets = get_ref_offsets(traversals[ref_trav_idx]);
+    }
+
+    // pass 2: get the supports
+    // we compute the various combinations of min/avg node/trav supports as we don't know which
+    // we will need until all the sizes are known
+    Support max_support;
+    max_support.set_forward(numeric_limits<int>::max());
+    vector<Support> min_supports_min(traversals.size(), max_support); // use min node support
+    vector<Support> min_supports_avg(traversals.size(), max_support); // use avg node support
+    vector<bool> has_support(traversals.size(), false);
+    vector<Support> tot_supports_min(traversals.size()); // weighted by lengths, using min node support
+    vector<Support> tot_supports_avg(traversals.size()); // weighted by lengths, using avg node support
+    vector<int> tot_sizes(traversals.size(), 0); // to compute average from tot_supports;
+    vector<int> tot_sizes_all(traversals.size(), 0); // as above, but includes excluded lengths
+    int max_trav_size = 0; // size of longest traversal
+
+    bool count_end_nodes = false; // toggle to include snarl ends
+
+    auto update_support = [&] (int trav_idx, const Support& min_support,
+                               const Support& avg_support, int length, int share_count) {
+        // keep track of overall size of longest traversal
+        tot_sizes_all[trav_idx] += length;
+        max_trav_size = std::max(tot_sizes_all[trav_idx], max_trav_size);
+
+        // apply the scaling
+        double scale_factor = ((exclusive_only || exclusive_count) && share_count > 0) ? 0. : 1. / (1. + share_count);
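+        // For example, a node shared with two other traversals (share_count == 2)
+        // keeps 1/3 of its support here; if exclusive_only or exclusive_count is
+        // set, that shared support is zeroed out instead.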
+
+        // when looking at exclusive support, we don't normalize by skipped lengths
+        if (scale_factor != 0 || !exclusive_only || exclusive_count) {
+            has_support[trav_idx] = true;
+            Support scaled_support_min = min_support * scale_factor;
+            Support scaled_support_avg = avg_support * scale_factor;
+
+            tot_supports_min[trav_idx] += scaled_support_min;
+            tot_supports_avg[trav_idx] += scaled_support_avg * length;
+            tot_sizes[trav_idx] += length;
+            min_supports_min[trav_idx] = support_min(min_supports_min[trav_idx], scaled_support_min);
+            min_supports_avg[trav_idx] = support_min(min_supports_avg[trav_idx], scaled_support_avg * length);
+        }
+    };
+
+    for (int trav_idx = 0; trav_idx < traversals.size(); ++trav_idx) {
+        const SnarlTraversal& trav = traversals[trav_idx];
+        for (int visit_idx = 0; visit_idx < trav.visit_size(); ++visit_idx) {
+            const Visit& visit = trav.visit(visit_idx);
+            Support min_support;
+            Support avg_support;
+            int64_t length;
+            int share_count = 0;
+
+            if (visit.node_id() != 0) {
+                // get the node support
+                min_support = get_min_node_support(visit.node_id());
+                avg_support = get_avg_node_support(visit.node_id());
+                length = graph.get_length(graph.get_handle(visit.node_id()));
+                if (node_counts.count(visit.node_id())) {
+                    share_count = node_counts[visit.node_id()];
+                }
+            } else {
+                // get the child support
+                tie(min_support, avg_support, length) = get_child_support(visit.snarl());
+                if (child_counts.count(visit.snarl())) {
+                    share_count = child_counts[visit.snarl()];
+                }
+            }
+            if (count_end_nodes || (visit_idx > 0 && visit_idx < trav.visit_size() - 1)) {
+                update_support(trav_idx, min_support, avg_support, length, share_count);
+            }
+            share_count = 0;
+
+            if (visit_idx > 0 && (trav.visit(visit_idx - 1).node_id() != 0 || trav.visit(visit_idx).node_id() != 0)) {
+                // get the edge support
+                edge_t edge = to_edge(graph, trav.visit(visit_idx - 1), visit);
+                min_support = get_edge_support(edge);
+                length = get_edge_length(edge, ref_offsets);
+                if (edge_counts.count(edge)) {
+                    share_count = edge_counts[edge];
+                }
+                update_support(trav_idx, min_support, min_support, length, share_count);
+            }
+        }
+    }
+
+    // correct for case where no exclusive support found
+    for (int i = 0; i < min_supports_min.size(); ++i) {
+        if (!has_support[i]) {
+            min_supports_min[i] = Support();
+            min_supports_avg[i] = Support();
+        }
+    }
+
+    bool use_avg_trav_support = max_trav_size >= average_traversal_support_switch_threshold;
+    bool use_avg_node_support = max_trav_size >= average_node_support_switch_threshold;
+
+    if (use_avg_trav_support) {
+        vector<Support>& tot_supports = use_avg_node_support ? tot_supports_avg : tot_supports_min;
+        for (int i = 0; i < tot_supports.size(); ++i) {
+            if (tot_sizes[i] > 0) {
+                tot_supports[i] /= (double)tot_sizes[i];
+            } else {
+                tot_supports[i] = Support();
+            }
+        }
+        return tot_supports;
+    } else {
+        return use_avg_node_support ? min_supports_avg : min_supports_min;
+    }
+}
+
+vector<int> TraversalSupportFinder::get_traversal_sizes(const vector<SnarlTraversal>& traversals) const {
+    vector<int> sizes(traversals.size(), 0);
+    for (int i = 0; i < traversals.size(); ++i) {
+        for (int j = 0; j < traversals[i].visit_size(); ++j) {
+            if (traversals[i].visit(j).node_id() != 0) {
+                sizes[i] += graph.get_length(graph.get_handle(traversals[i].visit(j).node_id()));
+            } else {
+                // just summing up the snarl contents, which isn't a great heuristic but will
+                // help in some cases
+                pair<unordered_set<id_t>, unordered_set<edge_t> > contents = snarl_manager.deep_contents(
+                    snarl_manager.into_which_snarl(traversals[i].visit(j)), graph, true);
+                for (id_t node_id : contents.first) {
+                    sizes[i] += graph.get_length(graph.get_handle(node_id));
+                }
+            }
+        }
+    }
+    return sizes;
+
+}
+
+size_t TraversalSupportFinder::get_average_traversal_support_switch_threshold() const {
+    return average_traversal_support_switch_threshold;
+}
+
+unordered_map<id_t, size_t> TraversalSupportFinder::get_ref_offsets(const SnarlTraversal& ref_trav) const {
+    unordered_map<id_t, size_t> ref_offsets;
+    size_t offset = 0;
+    for (int i = 0; i < ref_trav.visit_size(); ++i) {
+        const Visit& visit = ref_trav.visit(i);
+        if (visit.node_id() != 0) {
+            if (visit.backward()) {
+                offset += graph.get_length(graph.get_handle(visit.node_id()));
+                ref_offsets[visit.node_id()] = offset;
+            } else {
+                ref_offsets[visit.node_id()] = offset;
+                offset += graph.get_length(graph.get_handle(visit.node_id()));
+            }
+        }
+    }
+    return ref_offsets;
+}
+
+PackedTraversalSupportFinder::PackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager) :
+    TraversalSupportFinder(*dynamic_cast<const PathHandleGraph*>(packer.get_graph()), snarl_manager),
+    packer(packer) {
+}
+
+PackedTraversalSupportFinder::~PackedTraversalSupportFinder() {
+}
+
+Support PackedTraversalSupportFinder::get_edge_support(const edge_t& edge) const {
+    return get_edge_support(graph.get_id(edge.first), graph.get_is_reverse(edge.first),
+                            graph.get_id(edge.second), graph.get_is_reverse(edge.second));
+}
+
+Support PackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse,
+                                                       id_t to, bool to_reverse) const {
+    Edge proto_edge;
+    proto_edge.set_from(from);
+    proto_edge.set_from_start(from_reverse);
+    proto_edge.set_to(to);
+    proto_edge.set_to_end(to_reverse);
+    Support support;
+    support.set_forward(packer.edge_coverage(proto_edge));
+    return support;
+}
+
+Support PackedTraversalSupportFinder::get_min_node_support(id_t node) const {
+    Position pos;
+    pos.set_node_id(node);
+    size_t offset = packer.position_in_basis(pos);
+    size_t coverage = packer.coverage_at_position(offset);
+    size_t end_offset = offset + graph.get_length(graph.get_handle(node));
+    for (int i = offset + 1; i < end_offset; ++i) {
+        coverage = min(coverage, packer.coverage_at_position(i));
+    }
+    Support support;
+    support.set_forward(coverage);
+    return support;
+}
+
+Support PackedTraversalSupportFinder::get_avg_node_support(id_t node) const {
+    Position pos;
+    pos.set_node_id(node);
+    size_t offset = packer.position_in_basis(pos);
+    size_t coverage = 0;
+    size_t length = graph.get_length(graph.get_handle(node));
+    for (int i = 0; i < length; ++i) {
+        coverage += packer.coverage_at_position(offset + i);
+    }
+    Support support;
+    support.set_forward((double)coverage / (double)length);
+    return support;
+}
+
+
+}
diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp
new file mode 100644
index 00000000000..1b910ed3a3c
--- /dev/null
+++ b/src/traversal_support.hpp
@@ -0,0 +1,113 @@
+#ifndef VG_SUPPORT_FINDER_HPP_INCLUDED
+#define VG_SUPPORT_FINDER_HPP_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "handle.hpp"
+#include "snarls.hpp"
+#include "genotypekit.hpp"
+#include "packer.hpp"
+
+namespace vg {
+
+using namespace std;
+
+
+/**
+ * Get the read support of snarl traversals or sets of snarl traversals
+ */
+class TraversalSupportFinder {
+public:
+    TraversalSupportFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager);
+    virtual ~TraversalSupportFinder();
+
+    /// Support of an edge
+    virtual Support get_edge_support(const edge_t& edge) const = 0;
+    virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const = 0;
+
+    /// Effective length of an edge
+    virtual int64_t get_edge_length(const edge_t& edge, const unordered_map<id_t, size_t>& ref_offsets) const;
+
+    /// Minimum support of a node
+    virtual Support get_min_node_support(id_t node) const = 0;
+
+    /// Average support of a node
+    virtual Support get_avg_node_support(id_t node) const = 0;
+
+    /// Use node or edge support as proxy for child support (as was done in original calling code)
+    virtual tuple<Support, Support, int> get_child_support(const Snarl& snarl) const;
+
+    /// Get the support of a traversal
+    /// Child snarls are handled as in the old call code: their maximum support is used
+    virtual Support get_traversal_support(const SnarlTraversal& traversal) const;
+
+    /// Get the support of a set of traversals. Any support overlapping traversals in shared_travs
+    /// will have their support split. If exclusive_only is true, then any split support gets
+    /// rounded down to 0 (and ignored when computing mins or averages).
+    /// exclusive_count is like exclusive_only except shared traversals will be counted (as 0)
+    /// when doing average and min support
+    /// if the ref_trav_idx is given, it will be used for computing (deletion) edge lengths
+    virtual vector<Support> get_traversal_set_support(const vector<SnarlTraversal>& traversals,
+                                                      const vector<int>& shared_travs,
+                                                      bool exclusive_only,
+                                                      bool exclusive_count,
+                                                      int ref_trav_idx = -1) const;
+
+    /// Get the total length of all nodes in the traversal
+    virtual vector<int> get_traversal_sizes(const vector<SnarlTraversal>& traversals) const;
+
+    /// Get the average traversal support threshold
+    virtual size_t get_average_traversal_support_switch_threshold() const;
+
+    /// Relic from old code
+    static double support_val(const Support& support) { return total(support); };
+
+    /// get a map of the beginning of a node (in forward orientation) on a traversal
+    /// used for up-weighting large deletion edges in complex snarls with average support
+    unordered_map<id_t, size_t> get_ref_offsets(const SnarlTraversal& ref_trav) const;
+
+protected:
+
+    /// Use average instead of minimum support when determining a traversal's support
+    /// from its node and edge supports.
+    size_t average_traversal_support_switch_threshold = 50;
+    /// Use average instead of minimum support when determining a node's support
+    /// from its position supports.
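+    /// (e.g. with the default of 50, a site whose longest traversal is 60 bp
+    /// uses average rather than minimum node support; the traversal threshold
+    /// above is applied the same way in get_traversal_set_support)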
+    size_t average_node_support_switch_threshold = 50;
+
+    const PathHandleGraph& graph;
+
+    SnarlManager& snarl_manager;
+
+};
+
+/**
+ * Get the read support from a Packer object
+ */
+class PackedTraversalSupportFinder : public TraversalSupportFinder {
+public:
+    PackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager);
+    virtual ~PackedTraversalSupportFinder();
+
+    /// Support of an edge
+    virtual Support get_edge_support(const edge_t& edge) const;
+    virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const;
+
+    /// Minimum support of a node
+    virtual Support get_min_node_support(id_t node) const;
+
+    /// Average support of a node
+    virtual Support get_avg_node_support(id_t node) const;
+
+protected:
+
+    /// Derive supports from this pack index
+    const Packer& packer;
+};
+
+}
+
+#endif

From c4bdd4508879491ca56c724570a23cbf0cbf0c90 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 5 Nov 2019 17:36:25 -0500
Subject: [PATCH 25/79] move tmpfile init to packer construction to avoid race condition

---
 src/packer.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/packer.cpp b/src/packer.cpp
index 6f14e79dd46..c3182661776 100644
--- a/src/packer.cpp
+++ b/src/packer.cpp
@@ -68,7 +68,11 @@ Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins,
     if (bin_size) {
         n_bins = num_bases_dynamic / bin_size + 1;
     }
-    tmpfstream_locks = new std::mutex[n_bins];
+    if (record_edits) {
+        tmpfstream_locks = new std::mutex[n_bins];
+        // open tmpfile if needed
+        ensure_edit_tmpfiles_open();
+    }
 
     // speed up quality computation if necessary
     for (size_t i = 0; i < get_thread_count(); ++i) {
@@ -385,8 +389,6 @@ void Packer::add(const Alignment& aln, int min_mapq, int min_baseq , bool qual_a
     if (aln.mapping_quality() < min_mapq) {
         return;
     }
-    // open tmpfile if needed
-    ensure_edit_tmpfiles_open();
     // count the nodes, edges, and edits
     Mapping prev_mapping;
     bool has_prev_mapping = false;

From 781b38b1237c89c24c8509924b99e2b78df87c84 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Wed, 6 Nov 2019 09:01:14 -0500
Subject: [PATCH 26/79] forgotten init

---
 src/packer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/packer.cpp b/src/packer.cpp
index c3182661776..7b7055a2e2f 100644
--- a/src/packer.cpp
+++ b/src/packer.cpp
@@ -63,6 +63,7 @@ Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins,
     // mutexes for coverage
     base_locks = new std::mutex[coverage_dynamic.size()];
     edge_locks = new std::mutex[edge_coverage_dynamic.size()];
+    tmpfstream_locks = nullptr;
 
     // count the bins if binning
     if (bin_size) {

From 3fcca3c26e70504a82e359dd7b1b94a521114702 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Wed, 6 Nov 2019 17:20:14 -0800
Subject: [PATCH 27/79] Made things compile

---
 src/minimizer_mapper.cpp        |    2 +-
 src/seed_clusterer.cpp          | 1053 +++++++++++++++++--------------
 src/seed_clusterer.hpp          |   58 +-
 src/subcommand/cluster_main.cpp |    3 +-
 src/unittest/seed_clusterer.cpp |  385 +++++------
 5 files changed, 816 insertions(+), 685 deletions(-)

diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp
index 5a028a137d1..8b11b1f461d 100644
--- a/src/minimizer_mapper.cpp
+++ b/src/minimizer_mapper.cpp
@@ -187,7 +187,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) {
     }
 
     // Cluster the seeds. Get sets of input seed indexes that go together.
- vector> clusters paired_clusters = clusterer.cluster_seeds(seeds, distance_limit); + vector> clusters = clusterer.cluster_seeds(seeds, distance_limit); if (track_provenance) { funnel.substage("score"); diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index c05d75efbd7..ede84c05f36 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -2,7 +2,7 @@ #include -//#define DEBUG +#define DEBUG_CLUSTER namespace vg { @@ -10,22 +10,22 @@ namespace vg { dist_index(dist_index){ }; - vector> cluster_seeds (vector seeds, int64_t read_distance_limit) const { + SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (vector seeds, int64_t read_distance_limit) const { vector> all_seeds; all_seeds.push_back(std::move(seeds)); - tuple>>,vector>> all_clusters = - cluster_seeds(all_seeds, distance_limit); + tuple>>,vector>> all_clusters = + cluster_seeds(all_seeds, read_distance_limit, 0); return std::get<0>(all_clusters)[0]; }; - tuple>>,vector>> SnarlSeedClusterer::cluster_seeds ( + tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( vector> all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit) const { /* Given a vector of seeds and a limit, find a clustering of seeds where * seeds that are closer than the limit cluster together. * Returns a vector of cluster assignments */ -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << endl << "New cluster calculation:" << endl; #endif if (fragment_distance_limit != 0 && @@ -45,7 +45,9 @@ cerr << endl << "New cluster calculation:" << endl; //This stores all the tree relationships and cluster information //for a single level of the snarl tree as it is being processed //It also keeps track of the parents of the current level - TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit); + size_t seed_count = 0; + for (auto& v : all_seeds) seed_count+= v.size(); + TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); //Populate tree_state.node_to_seeds (mapping each node to the seeds it //contains) and snarl_to_nodes_by_level @@ -85,23 +87,38 @@ cerr << endl << "New cluster calculation:" << endl; tree_state.chain_to_snarls.clear(); } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Found read clusters : " << endl; - for (auto group : tree_state.read_union_find.all_groups()){ - for (size_t c : group) { - cerr << tree_state.seeds->at(c) << " "; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << "\t read num " << read_num << ": " ; + for (auto group : tree_state.read_union_find[read_num].all_groups()){ + cerr << "\t\t"; + for (size_t c : group) { + cerr << tree_state.all_seeds->at(read_num)[c] << " "; + } + cerr << endl; + } + } + vector ordered_seeds; + for (size_t i = 0 ; i < tree_state.all_seeds->size() ; i++) { + for (auto x : tree_state.all_seeds->at(i)) { + ordered_seeds.push_back(x); } - cerr << endl; } cerr << "Found fragment clusters : " << endl; for (auto group : tree_state.fragment_union_find.all_groups()){ + cerr << "\t"; for (size_t c : group) { - cerr << tree_state.seeds->at(c) << " "; + cerr << ordered_seeds[c] << " "; } cerr << endl; } #endif - return make_tuple(tree_state.read_union_find.all_groups(), + vector>> read_clusters; + for (auto& uf : tree_state.read_union_find) { + read_clusters.emplace_back(uf.all_groups()); + } + return make_tuple(std::move(read_clusters), tree_state.fragment_union_find.all_groups()); }; @@ -116,27 +133,29 @@ cerr << endl << "New cluster calculation:" 
<< endl; snarl_to_nodes) const { // Assign each seed to a node. - tree_state.node_to_seeds.reserve(tree_state.all_seeds->size()); - for (size_t i = 0; i < tree_state.seeds->size(); i++) { - for (size_t j = 0 ; j < tree_state.all_seeds[i].size() ; j++) { - id_t id = get_id(tree_state.all_seeds->at(i)->at(j)); - tree_state.node_to_seeds.emplace_back(id, i, j); + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ + vector& seeds = tree_state.all_seeds->at(read_num); + for (size_t i = 0; i < seeds.size(); i++) { + id_t id = get_id(seeds.at(i)); + tree_state.node_to_seeds[read_num].emplace_back(id, i); //For each seed, assign it to a node and the node to a snarl } + std::sort(tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end()); } - std::sort(tree_state.node_to_seeds.begin(), tree_state.node_to_seeds.end()); // Assign each node to a snarl. id_t prev_node = -1; - for (auto mapping : tree_state.node_to_seeds) { - if (get<0>(mapping) == prev_node) { - continue; + for (auto& read_node :tree_state.node_to_seeds) { + for (auto& mapping : read_node) { + if (mapping.first == prev_node) { + continue; + } + prev_node = mapping.first; + size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); + size_t depth = dist_index.snarl_indexes[snarl_i].depth; + snarl_to_nodes[depth][snarl_i].emplace_back( + NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); } - prev_node = mapping.first; - size_t snarl_i = dist_index.getPrimaryAssignment(get<0>(mapping)); - size_t depth = dist_index.snarl_indexes[snarl_i].depth; - snarl_to_nodes[depth][snarl_i].emplace_back( - NetgraphNode(get<0>(mapping), NODE), NodeClusters()); } } @@ -153,7 +172,7 @@ cerr << endl << "New cluster calculation:" << endl; MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[snarl_i]; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "At depth " << depth << " snarl number " << snarl_i << " headed by " << snarl_index.id_in_parent << " with children " << endl; @@ -175,7 +194,7 @@ cerr << endl << "New cluster calculation:" << endl; chain_rank, make_pair(snarl_i, cluster_one_snarl(tree_state, snarl_i))); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i << " headed by " << snarl_index.id_in_parent << " as a child of chain number " << chain_assignment << " headed by " << snarl_index.parent_id << endl; @@ -187,7 +206,7 @@ cerr << endl << "New cluster calculation:" << endl; if (depth != 0 && snarl_index.parent_id != 0){ //If this has a parent, record it -#ifdef DEBUG +#ifdef DEBUG_CLUSTER assert(snarl_index.parent_id >= dist_index.min_node_id); assert(snarl_index.parent_id <= dist_index.max_node_id); #endif @@ -199,7 +218,7 @@ cerr << endl << "New cluster calculation:" << endl; NetgraphNode (snarl_i, SNARL), cluster_one_snarl(tree_state, snarl_i)); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i << " headed by " << snarl_index.id_in_parent << " as a child of snarl number " << parent_snarl_i @@ -225,7 +244,7 @@ cerr << endl << "New cluster calculation:" << endl; // Get the chain's number size_t chain_i = kv.first; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "At depth " << depth << " chain number " << chain_i << " with children " << endl; for (auto it2 : kv.second) { @@ -242,7 +261,7 @@ cerr << endl << "New cluster calculation:" << endl; // Find the node ID that heads the parent of that chain. 
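            // (The chain's clusters are then handed up to the snarl that contains
            // this chain, so the next, shallower level of the snarl tree can keep
            // combining them.)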
size_t parent_id = dist_index.chain_indexes[chain_i].parent_id; // It must be a legitimate node ID we cover. -#ifdef DEBUG +#ifdef DEBUG_CLUSTER assert(parent_id >= dist_index.min_node_id); assert(parent_id <= dist_index.max_node_id); #endif @@ -254,7 +273,7 @@ cerr << endl << "New cluster calculation:" << endl; tree_state.parent_snarl_to_nodes[parent_snarl_i].emplace_back( NetgraphNode (chain_i, CHAIN), std::move(chain_clusters)); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Recording chain number " << chain_i << " headed by " << dist_index.chain_indexes[chain_i].id_in_parent << " as a child of snarl number " << parent_snarl_i @@ -269,168 +288,216 @@ cerr << endl << "New cluster calculation:" << endl; SnarlSeedClusterer::NodeClusters SnarlSeedClusterer::cluster_one_node( TreeState& tree_state, id_t node_id, int64_t node_length) const { -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding clusters on node " << node_id << " which has length " << node_length << endl; #endif /*Find clusters of seeds in this node. - * Returns a hash_set of the union find group IDs of the new clusters, + * Result contains hash_set of the union find group IDs of the new clusters, * and the shortest distance from any seed to the left and right sides * of the node*/ - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds.begin(), - tree_state.node_to_seeds.end(), - std::tuple(node_id, 0, 0)); - //indices of union find group ids of clusters in this node - NodeClusters node_clusters; + NodeClusters node_clusters(tree_state.all_seeds->size()); if (tree_state.read_distance_limit > node_length) { //If the limit is greater than the node length, then all the //seeds on this node must be in the same cluster - size_t group_id = seed_range_start->second; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + if (tree_state.node_to_seeds[read_num].size() > 0) { + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + + size_t group_id = seed_range_start->second; + size_t fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; + + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + //For each seed on this node, add it to the cluster + //And find the shortest distance from any seed to both + //ends of the node + + pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) + : get_offset(seed) + 1; + int64_t dist_right = is_rev(seed) ? 
get_offset(seed) + 1 + : node_length - get_offset(seed); + + node_clusters.read_best_left[read_num] = min_not_minus_one(dist_left, + node_clusters.read_best_left[read_num]); + node_clusters.read_best_right[read_num] = min_not_minus_one(dist_right, + node_clusters.read_best_right[read_num]); + node_clusters.fragment_best_left = min_not_minus_one(dist_left, + node_clusters.fragment_best_left); + node_clusters.fragment_best_right = min_not_minus_one(dist_right, + node_clusters.fragment_best_right); + + tree_state.read_union_find[read_num].union_groups(group_id, iter->second); + if (tree_state.fragment_distance_limit != 0 ) { + tree_state.fragment_union_find.union_groups(fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); + } - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds.end() && iter->first == node_id; ++iter) { - //For each seed on this node, add it to the cluster - //And find the shortest distance from any seed to both - //ends of the node + } - pos_t seed = tree_state.all_seeds->at(std::get<1>(*iter)).at(std::get<2>(*iter)); - int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) - : get_offset(seed) + 1; - int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 - : node_length - get_offset(seed); + //Record the new cluster + group_id = tree_state.read_union_find[read_num].find_group(group_id); + tree_state.read_cluster_dists[read_num][group_id] = make_pair(node_clusters.read_best_left[read_num], + node_clusters.read_best_right[read_num]); + node_clusters.read_cluster_heads.emplace(read_num, group_id); - node_clusters.best_left = min_not_minus_one(dist_left, - node_clusters.best_left); - node_clusters.best_right = min_not_minus_one(dist_right, - node_clusters.best_right); + if (tree_state.fragment_distance_limit != 0) { + fragment_group_id = tree_state.fragment_union_find.find_group(fragment_group_id); + tree_state.fragment_cluster_dists[fragment_group_id] = make_pair(node_clusters.fragment_best_left, + node_clusters.fragment_best_right); + } +#ifdef DEBUG_CLUSTER + assert (group_id == tree_state.read_union_find[read_num].find_group(group_id)); + cerr << "Found single cluster on node " << node_id << "with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (pair c : node_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + //if (dists.first == node_clusters.read_best_left[read_num]) {got_all_left[read_num] = true;} + //if (dists.second == node_clusters.read_best_right[read_num]) {got_all_right[read_num] = true;} + cerr << "\t" << c.first << ":"<second); - if (tree_state.fragment_distance_limit != 0) { - tree_state.fragment_union_find.union_groups(group_id, iter->second); +#endif } - - } - - //Record the new cluster - group_id = tree_state.read_union_find.find_group(group_id); - tree_state.read_cluster_dists[group_id] = make_pair(node_clusters.best_left, - node_clusters.best_right); - 
node_clusters.cluster_heads.insert(group_id); -#ifdef DEBUG - assert (group_id == tree_state.read_union_find.find_group(group_id)); - cerr << "Found single cluster on node " << node_id << endl; - bool got_left = false; - bool got_right = false; - for (size_t c : node_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - assert(dists.first == -1 || dists.first >= node_clusters.best_left); - assert(dists.second == -1 || dists.second >= node_clusters.best_right); - if (dists.first == node_clusters.best_left) {got_left = true;} - if (dists.second == node_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " << - dists.second << endl; } - assert(got_left); - assert(got_right); -#endif return node_clusters; } - //Create a vector of seeds with their offsets - vector> seed_offsets; - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds.end() && iter->first == node_id; ++iter) { - //For each seed, find its offset - pos_t seed = tree_state.seeds->at(iter->second); - int64_t offset = is_rev(seed) ? node_length - get_offset(seed) - : get_offset(seed) + 1; - - node_clusters.best_left = min_not_minus_one(offset, - node_clusters.best_left); - node_clusters.best_right = min_not_minus_one(node_length-offset+1, - node_clusters.best_right); - - seed_offsets.emplace_back(iter->second, offset); + vector> seed_offsets; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + // for all seeds + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + //For each seed, find its offset + pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) + : get_offset(seed) + 1; + + node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); + node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); + node_clusters.read_best_left[read_num] = min_not_minus_one(offset, node_clusters.read_best_left[read_num]); + node_clusters.read_best_right[read_num] = min_not_minus_one(node_length-offset+1, node_clusters.read_best_right[read_num]); + + seed_offsets.emplace_back(read_num, iter->second, offset); + } } //Sort seeds by their position in the node std::sort(seed_offsets.begin(), seed_offsets.end(), [&](const auto a, const auto b) -> bool { - return a.second < b.second; + return std::get<2>(a) < std::get<2>(b); } ); - int64_t last_offset = 0; int64_t read_last_left = -1; - size_t read_last_cluster = seed_offsets[0].first; - int64_t fragment_last_left = -1; - size_t fragment_last_cluster = seed_offsets[0].first; - node_clusters.cluster_heads.insert(read_last_cluster); + vector read_last_offset (tree_state.all_seeds->size(), -1); + int64_t fragment_last_offset = -1; + size_t fragment_last_cluster = -1; + vector read_last_cluster (tree_state.all_seeds->size(), -1); - for ( pair s : seed_offsets) { + for ( tuple s : seed_offsets) { //For each seed, in order of position in the node, //see if it belongs to a new read/fragment cluster - if it is //close enough to the previous seed + size_t read_num = std::get<0>(s); - if (read_last_left != -1 && - abs(s.second - last_offset) <= tree_state.read_distance_limit) { + if (read_last_offset[read_num] != -1 && + abs(std::get<2>(s) - read_last_offset[read_num]) <= tree_state.read_distance_limit) { + //TODO: Need abs? //If this seed is in the same read cluster as the previous one, //union them - tree_state.read_union_find.union_groups(s.first, read_last_cluster); - read_last_cluster = tree_state.read_union_find.find_group(s.first); - tree_state.read_cluster_dists[read_last_cluster] = make_pair(read_last_left, node_length-s.second+1); + int64_t prev_dist_left = tree_state.read_cluster_dists[read_num][read_last_cluster[read_num]].first; + tree_state.read_union_find[read_num].union_groups(std::get<1>(s), read_last_cluster[read_num]); + read_last_cluster[read_num] = tree_state.read_union_find[read_num].find_group(std::get<1>(s)); + tree_state.read_cluster_dists[read_num][read_last_cluster[read_num]] = + make_pair(prev_dist_left,node_length-std::get<2>(s)+1); + read_last_offset[read_num] = std::get<2>(s); if (tree_state.fragment_distance_limit != 0) { //If we are also clustering paired end reads by fragment distance, //cluster these together - tree_state.fragment_union_find.union_groups(s.first, fragment_last_cluster); - fragment_last_cluster = tree_state.fragment_union_find.find_group(s.first); + int64_t prev_dist_left = tree_state.fragment_cluster_dists[fragment_last_cluster].first; + tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); + fragment_last_cluster = tree_state.fragment_union_find.find_group(std::get<1>(s)+tree_state.read_index_offsets[read_num]); + tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); + fragment_last_offset = std::get<2>(s); } } else { - //This becomes a new cluster - node_clusters.cluster_heads.insert(s.first); - read_last_cluster = s.first; - read_last_left = s.second; - tree_state.read_cluster_dists[s.first] = 
make_pair(read_last_left, node_length - s.second + 1); + //This becomes a new read cluster + if (read_last_cluster[read_num] != -1) { + node_clusters.read_cluster_heads.emplace(read_num, read_last_cluster[read_num]); + } + read_last_cluster[read_num] = std::get<1>(s); + read_last_offset[read_num] = std::get<2>(s); + tree_state.read_cluster_dists[read_num][read_last_cluster[read_num]] = + make_pair(read_last_offset[read_num], node_length - read_last_offset[read_num] + 1); if (tree_state.fragment_distance_limit != 0) { - if (read_last_left != -1 && - abs(s.second - last_offset) <= tree_state.fragment_distance_limit) { + if (fragment_last_offset != -1 && + abs(read_last_offset[read_num] - fragment_last_offset) <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster - tree_state.fragment_union_find.union_groups(s.first, fragment_last_cluster); - fragment_last_cluster = tree_state.fragment_union_find.find_group(s.first); + int64_t prev_dist_left = tree_state.fragment_cluster_dists[fragment_last_cluster].first; + tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); + fragment_last_cluster = tree_state.fragment_union_find.find_group(fragment_last_cluster); + tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); } else { //If this is a new fragment cluster as well - fragment_last_cluster = s.first; - fragment_last_left = s.second; + fragment_last_cluster = std::get<1>(s)+tree_state.read_index_offsets[read_num]; + fragment_last_offset = std::get<2>(s); + tree_state.fragment_cluster_dists[fragment_last_cluster] = + make_pair(fragment_last_offset, node_length-fragment_last_offset+1); } } } - last_offset = s.second; - + } + for (size_t i = 0 ; i < read_last_cluster.size() ; i++) { + node_clusters.read_cluster_heads.emplace(i, read_last_cluster[i]); } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER + cerr << "Found read clusters on node " << node_id << endl; bool got_left = false; bool got_right = false; - for (size_t c : node_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - assert(dists.first == -1 || dists.first >= node_clusters.best_left); - assert(dists.first == -1 || dists.second >= node_clusters.best_right); - if (dists.first == node_clusters.best_left) {got_left = true;} - if (dists.second == node_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " - << dists.second << endl; + + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + for (pair c : node_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":"< group_id : node_clusters.read_cluster_heads) { + assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); } #endif + return node_clusters; }; @@ -448,51 +515,48 @@ cerr << endl << "New cluster calculation:" << endl; 
MinimumDistanceIndex::ChainIndex& chain_index = dist_index.chain_indexes[ chain_index_i]; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding clusters on chain number " << chain_index_i << " headed by node " << chain_index.id_in_parent << endl; #endif auto combine_snarl_clusters = [&] (size_t& new_group, size_t& combined_group, size_t& fragment_combined_group, - vector& to_erase, int64_t dist, - pair& dists){ + vector>& to_erase, int64_t fragment_dist,int64_t read_dist, + pair& dists, size_t read_num){ //Helper function to combine clusters of the same snarl //Used when two clusters in the same snarl can be combined by //looping in the chain - if (dist <= tree_state.read_distance_limit) { + if (read_dist <= tree_state.read_distance_limit) { if (combined_group == -1) { combined_group = new_group; } else { //Union the two groups - combined_group = tree_state.read_union_find.find_group( - combined_group); - tree_state.read_union_find.union_groups(combined_group, - new_group); + combined_group = tree_state.read_union_find[read_num].find_group(combined_group); + tree_state.read_union_find[read_num].union_groups(combined_group, new_group); //Find the new distances of the combined groups pair& old_dists = - tree_state.read_cluster_dists[combined_group]; - size_t new_combined_group = - tree_state.read_union_find.find_group(new_group); + tree_state.read_cluster_dists[read_num][combined_group]; + size_t new_combined_group = tree_state.read_union_find[read_num].find_group(new_group); //Update which groups are being kept track of if (new_combined_group != new_group) { - to_erase.push_back(new_group); + to_erase.emplace_back(read_num, new_group); } if (new_combined_group != combined_group) { - to_erase.push_back(combined_group); + to_erase.emplace_back(read_num, combined_group); } combined_group = new_combined_group; dists = make_pair( min_not_minus_one(old_dists.first, dists.first), min_not_minus_one(old_dists.second, dists.second)); - tree_state.read_cluster_dists[new_group] = dists; - tree_state.read_cluster_dists[combined_group] = dists; -#ifdef DEBUG - cerr << " New dists: " - << tree_state.read_cluster_dists[combined_group].first << " " - << tree_state.read_cluster_dists[combined_group].second << endl; + tree_state.read_cluster_dists[read_num][new_group] = dists; + tree_state.read_cluster_dists[read_num][combined_group] = dists; +#ifdef DEBUG_CLUSTER + cerr << " New dists for read num " << read_num << ": " + << tree_state.read_cluster_dists[read_num][combined_group].first << " " + << tree_state.read_cluster_dists[read_num][combined_group].second << endl; #endif } @@ -500,12 +564,13 @@ cerr << endl << "New cluster calculation:" << endl; if (fragment_combined_group != -1) { //If we're keeping track of fragment clusters, union this tree_state.fragment_union_find.union_groups(fragment_combined_group, - new_group); + new_group + tree_state.read_index_offsets[read_num]); } - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + fragment_combined_group = tree_state.fragment_union_find.find_group( + new_group + tree_state.read_index_offsets[read_num]); } } else if (tree_state.fragment_distance_limit != 0 && - dist <= tree_state.fragment_distance_limit) { + fragment_dist <= tree_state.fragment_distance_limit) { //If these aren't in the same read cluster but are in //the same fragment cluster if (fragment_combined_group == -1) { @@ -519,7 +584,7 @@ cerr << endl << "New cluster calculation:" << endl; }; //The clusters of the chain that are built from the snarl clusters //This 
will get updated as we traverse through the snarls - NodeClusters chain_clusters; + NodeClusters chain_clusters(tree_state.all_seeds->size()); //The rank of the node at which the chain clusters reach // (the last snarl that was traversed) @@ -572,13 +637,17 @@ cerr << endl << "New cluster calculation:" << endl; make_pair(start_rank, false), last_len, start_length); offset = offset - last_len + start_length; - for (size_t i : chain_clusters.cluster_heads) { - tree_state.read_cluster_dists[i].second = - tree_state.read_cluster_dists[i].second == -1 - ? -1 : tree_state.read_cluster_dists[i].second + offset; + for (pair c : chain_clusters.read_cluster_heads) { + tree_state.read_cluster_dists[c.first][c.second].second = + tree_state.read_cluster_dists[c.first][c.second].second == -1 + ? -1 : tree_state.read_cluster_dists[c.first][c.second].second + offset; + } + chain_clusters.fragment_best_right = chain_clusters.fragment_best_right == -1 ? -1 + : chain_clusters.fragment_best_right + offset; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + chain_clusters.read_best_right[read_num] = chain_clusters.read_best_right[read_num] == -1 ? -1 + : chain_clusters.read_best_right[read_num] + offset; } - chain_clusters.best_right = chain_clusters.best_right == -1 ? -1 - : chain_clusters.best_right + offset; } last_rank = start_rank + 1; @@ -596,19 +665,19 @@ cerr << endl << "New cluster calculation:" << endl; int64_t loop_dist_start = chain_index.loop_rev[start_rank] - 1; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Looking at snarl rank " << start_rank << " representing " << snarl_index.id_in_parent << endl; - cerr << " Snarl distance limits: " << snarl_clusters.best_left - << " " << snarl_clusters.best_right << endl; + cerr << " Snarl fragment distance limits: " << snarl_clusters.fragment_best_left + << " " << snarl_clusters.fragment_best_right << endl; cerr << " Snarl clusters to add: " << endl; - for (size_t c : snarl_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - cerr << "\tleft: " << dists.first << " right : " << dists.second + for (pair c : snarl_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\tread " << c.first << ",cluster " << c.second << " left: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -617,16 +686,16 @@ cerr << endl << "New cluster calculation:" << endl; cerr << " Clusters on chain: " << endl; - cerr << " best left: " << chain_clusters.best_left << " best right: " - << chain_clusters.best_right << endl; - for (size_t c : chain_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; + cerr << " best left: " << chain_clusters.fragment_best_left << " best right: " + << chain_clusters.fragment_best_right << endl; + for (pair c : chain_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << 
tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -637,31 +706,35 @@ cerr << endl << "New cluster calculation:" << endl; //Need to remember this to check if snarl clusters overlap the old //best distance - int64_t old_chain_right = chain_clusters.best_right; + int64_t fragment_chain_right = chain_clusters.fragment_best_right; + vector read_chain_right = std::move(chain_clusters.read_best_right); - vector to_add;//new cluster group ids from snarl clusters - vector to_erase; //old cluster group ids + vector> to_add;//new cluster group ids from snarl clusters + vector> to_erase; //old cluster group ids //New cluster- there will be at most one new cluster to add - size_t combined_cluster = -1; + vector< size_t> combined_cluster (tree_state.all_seeds->size(), -1); size_t fragment_combined_cluster = -1; - int64_t combined_left = -1; int64_t combined_right = -1; + vector combined_left (tree_state.all_seeds->size(), -1); + vector combined_right (tree_state.all_seeds->size(), -1); //Combined snarl clusters by taking chain loop left/right - size_t snarl_cluster_left = -1; - size_t snarl_cluster_right = -1; + vector snarl_cluster_left (tree_state.all_seeds->size(),-1); + vector snarl_cluster_right (tree_state.all_seeds->size(), -1); size_t fragment_snarl_cluster_left = -1; size_t fragment_snarl_cluster_right = -1; - chain_clusters.best_left = -1; chain_clusters.best_right = -1; - for (size_t j : snarl_clusters.cluster_heads) { + chain_clusters.fragment_best_right = -1; + chain_clusters.read_best_right.assign(tree_state.all_seeds->size(), -1); + for (pair cluster_head : snarl_clusters.read_cluster_heads) { // For each of the clusters for the current snarl, // first check if it can be combined with any other // snarl clusters by taking loops in the chain, // then, find if it belongs to the new combined cluster // that includes chain clusters + size_t read_num = cluster_head.first; pair snarl_dists = - std::move(tree_state.read_cluster_dists[j]); + std::move(tree_state.read_cluster_dists[read_num][cluster_head.second]); if (loop_dist_start != -1) { //If there is a loop going out and back into the start of @@ -671,183 +744,177 @@ cerr << endl << "New cluster calculation:" << endl; //The distance to the right side of the snarl // that is found by taking the leftmost seed and // looping through the chain to the left - int64_t new_right = - snarl_dists.first == -1 || loop_dist_start == -1 + int64_t new_right = snarl_dists.first == -1 || loop_dist_start == -1 ? 
-1 - : snarl_dists.first + loop_dist_start - + snarl_length - start_length; - snarl_dists.second = min_not_minus_one(new_right, - snarl_dists.second); - snarl_clusters.best_right =min_not_minus_one(snarl_clusters.best_right, - new_right); -#ifdef DEBUG -cerr << " (Possibly) updating looping distance to right of snarl cluster " << j << ": " + : snarl_dists.first + loop_dist_start + snarl_length - start_length; + snarl_dists.second = min_not_minus_one(new_right, snarl_dists.second); + snarl_clusters.fragment_best_right = + min_not_minus_one(snarl_clusters.fragment_best_right, new_right); + snarl_clusters.read_best_right[read_num] = + min_not_minus_one(snarl_clusters.read_best_right[read_num], new_right); +#ifdef DEBUG_CLUSTER +cerr << " (Possibly) updating looping distance to right of snarl cluster " << read_num <<":" << cluster_head.second << ": " << new_right << " -> " << snarl_dists.second << endl; #endif - if (snarl_clusters.best_left != -1 && snarl_dists.first != -1 ) { + if (snarl_clusters.read_best_left[read_num] != -1 && snarl_dists.first != -1 ) { //If this cluster can be combined with another cluster //from the left -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the left " ; #endif - combine_snarl_clusters(j, snarl_cluster_left, fragment_snarl_cluster_left, - to_erase, snarl_clusters.best_left + snarl_dists.first - + loop_dist_start - start_length - 1, snarl_dists); + combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], fragment_snarl_cluster_left, + to_erase, snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1, + snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1, + snarl_dists, read_num); } } if (loop_dist_end != -1) { //If there is a loop to the right - int64_t new_left = - snarl_dists.second == -1 || loop_dist_end == -1 - ? -1 - : snarl_dists.second + loop_dist_end + snarl_length - - end_length; - if (snarl_dists.first == -1 || (new_left != -1 & - new_left < snarl_dists.first)){ + int64_t new_left = snarl_dists.second == -1 || loop_dist_end == -1 + ? 
-1 + : snarl_dists.second + loop_dist_end + snarl_length - end_length; + if (snarl_dists.first == -1 || (new_left != -1 & new_left < snarl_dists.first)){ //If this is an improvement, update distances snarl_dists.first = new_left; - snarl_clusters.best_left = min_not_minus_one(new_left, - snarl_clusters.best_left); + snarl_clusters.read_best_left[read_num] = + min_not_minus_one(new_left, snarl_clusters.read_best_left[read_num]); + snarl_clusters.fragment_best_left = min_not_minus_one(new_left, snarl_clusters.fragment_best_left); -#ifdef DEBUG -cerr << "Updating looping distance to left of snarl cluster" << j << ": " +#ifdef DEBUG_CLUSTER +cerr << "Updating looping distance to left of snarl cluster " << read_num << ":" << cluster_head.second << ": " << new_left << endl; #endif } - if (snarl_clusters.best_right != -1 && snarl_dists.second != -1 ) { + if (snarl_clusters.read_best_right[read_num] != -1 && snarl_dists.second != -1 ) { //If this cluster can be combined with another cluster //from the right -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the right" << endl; #endif - combine_snarl_clusters(j, snarl_cluster_right, + combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], fragment_snarl_cluster_right, to_erase, - snarl_clusters.best_right + snarl_dists.second - + loop_dist_end - end_length - 1, snarl_dists); + snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1, + snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1, + snarl_dists, read_num); } } //Now check if this snarl cluster can be combined with any //existing chain clusters - if (old_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first + old_chain_right - start_length-1 + if (read_chain_right[read_num] != -1 && snarl_dists.first != -1 && + snarl_dists.first + read_chain_right[read_num] - start_length-1 <= tree_state.read_distance_limit) { //If this snarl cluster's leftmost seed is close enough to //the rightmost seed in the chain (up to this point), then //this snarl cluster is in the combined cluster - if (combined_cluster == -1) { - combined_cluster = j; - combined_left = snarl_dists.first == -1 ? -1 : + if (combined_cluster[read_num] == -1) { + combined_cluster[read_num] = cluster_head.second; + combined_left[read_num] = snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left; - combined_right = snarl_dists.second; + combined_right[read_num] = snarl_dists.second; } else { //Cluster - tree_state.read_union_find.union_groups(combined_cluster, j); - size_t new_group = tree_state.read_union_find.find_group(j); - combined_cluster = new_group; - combined_left = min_not_minus_one(combined_left, - snarl_dists.first == -1 ? -1 : - snarl_dists.first + add_dist_left); - combined_right = min_not_minus_one(combined_right, - snarl_dists.second); + tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); + combined_cluster[read_num] = tree_state.read_union_find[read_num].find_group(cluster_head.second); + combined_left[read_num] = min_not_minus_one(combined_left[read_num], + snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left); + combined_right[read_num] = min_not_minus_one(combined_right[read_num],snarl_dists.second); } if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_cluster != -1) { //Also cluster by fragment - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, j); + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, + cluster_head.second+tree_state.read_index_offsets[read_num]); } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(j); + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } } else { //If the snarl cluster does not get combined with any of - //the existing chain clusters, then it becomes a new - //chain cluster - if (tree_state.fragment_distance_limit != 0 && - old_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first + old_chain_right - start_length-1 - <= tree_state.fragment_distance_limit) { - //If this is a new read cluster but the same fragment cluster - if (fragment_combined_cluster == -1 ) { - fragment_combined_cluster = j; - } else { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, j); - fragment_combined_cluster = tree_state.fragment_union_find.find_group(j); + //the existing chain clusters, then it becomes a new chain cluster + if (tree_state.fragment_distance_limit != 0 && fragment_chain_right != -1 && snarl_dists.first != -1 && + snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.read_distance_limit) { + //Cluster in the same fragment but not the same read + if (fragment_combined_cluster != -1) { + //Also cluster by fragment + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, + cluster_head.second+tree_state.read_index_offsets[read_num]); } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } - to_add.push_back(j); + to_add.push_back(cluster_head); //Update its distances to the correct nodes in the chain - pair d = make_pair(snarl_dists.first == -1 - ? -1 : snarl_dists.first + add_dist_left, + pair d = make_pair(snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left, snarl_dists.second); - chain_clusters.best_left = min_not_minus_one(chain_clusters.best_left, + chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left,d.first); + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right,d.second); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], d.first); - chain_clusters.best_right = min_not_minus_one(chain_clusters.best_right, + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], d.second); - tree_state.read_cluster_dists[j] = std::move(d); + tree_state.read_cluster_dists[read_num][cluster_head.second] = std::move(d); } } //Next, go through each of the clusters of the chain and decide //if they get combined with snarl clusters - for (size_t i : chain_clusters.cluster_heads) { + for (pair cluster_head : chain_clusters.read_cluster_heads) { //For each old chain cluster - pair& chain_dists = tree_state.read_cluster_dists[i]; + size_t read_num = cluster_head.first; + pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; - if (snarl_clusters.best_left != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.best_left + if (snarl_clusters.read_best_left[read_num] != -1 && chain_dists.second != -1 + && chain_dists.second + snarl_clusters.read_best_left[read_num] - start_length-1 <= tree_state.read_distance_limit){ //If this chain cluster's rightmost seed is close enough //to the leftmost seed of any cluster in the snarl, then //this chain cluster is in the combined cluster - if (combined_cluster == -1) { - combined_cluster = i; - combined_left = chain_dists.first; - combined_right = chain_dists.second + dist_to_end; + if (combined_cluster[read_num] == -1) { + //New chain cluster + combined_cluster[read_num] = cluster_head.second; + combined_left[read_num] = chain_dists.first; + combined_right[read_num] = chain_dists.second + dist_to_end; } else { - tree_state.read_union_find.union_groups(combined_cluster, i); - size_t new_group = tree_state.read_union_find.find_group(i); - if (new_group == i) { - to_erase.push_back(combined_cluster); + //Combine + tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); + if (new_group == cluster_head.second) { + to_erase.emplace_back(read_num,combined_cluster[read_num]); } else { - to_erase.push_back(i); + to_erase.push_back(cluster_head); } - combined_cluster = new_group; - combined_left = min_not_minus_one(combined_left, - chain_dists.first); - combined_right = min_not_minus_one(combined_right, - chain_dists.second + dist_to_end); + combined_cluster[read_num] = new_group; + combined_left[read_num] = min_not_minus_one(combined_left[read_num], chain_dists.first); + combined_right[read_num] = min_not_minus_one(combined_right[read_num], chain_dists.second + dist_to_end); } if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_cluster != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + fragment_combined_cluster = 
tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } } else { //If this chain cluster is on its own, extend its right //distance to the end of the current snarl if (tree_state.fragment_distance_limit != 0 && - snarl_clusters.best_left != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.best_left + snarl_clusters.fragment_best_left != -1 && chain_dists.second != -1 + && chain_dists.second + snarl_clusters.fragment_best_left - start_length-1 <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster - if (fragment_combined_cluster == -1 ) { - fragment_combined_cluster = i; - } else { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + if (fragment_combined_cluster != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } chain_dists.second += dist_to_end; if ((tree_state.fragment_distance_limit == 0 && @@ -860,49 +927,51 @@ cerr << " Combining this cluster from the right" << endl; //either end of the chain is greater than the distance //limit, then it cannot cluster with anything else //so we can stop keeping track of it -#ifdef DEBUG - cerr << "Removing cluster " << i << endl; +#ifdef DEBUG_CLUSTER + cerr << "Removing cluster " << cluster_head.first << ":" << cluster_head.second << endl; #endif - to_erase.push_back(i); + to_erase.push_back(cluster_head); } else { - chain_clusters.best_left = min_not_minus_one( - chain_clusters.best_left, chain_dists.first); - chain_clusters.best_right = - min_not_minus_one(chain_clusters.best_right, - chain_dists.second); + chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left, chain_dists.first); + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right, chain_dists.second); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], chain_dists.first); + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], chain_dists.second); } } } //Update the chain cluster heads - for (size_t j : to_add) { - chain_clusters.cluster_heads.insert(j); + for (auto c : to_add) { + chain_clusters.read_cluster_heads.insert(c); } - for (size_t j : to_erase) { - chain_clusters.cluster_heads.erase(j); + for (auto c : to_erase) { + chain_clusters.read_cluster_heads.erase(c); } - if (combined_cluster != -1 ) { - chain_clusters.cluster_heads.insert(combined_cluster); - tree_state.read_cluster_dists[combined_cluster] = - make_pair(combined_left, combined_right); - chain_clusters.best_left = min_not_minus_one(chain_clusters.best_left, - combined_left); - chain_clusters.best_right = min_not_minus_one(chain_clusters.best_right, - combined_right); + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + if (combined_cluster[read_num] != -1 ) { + chain_clusters.read_cluster_heads.emplace(read_num, combined_cluster[read_num]); + tree_state.read_cluster_dists[read_num][combined_cluster[read_num]] = + make_pair(combined_left[read_num], combined_right[read_num]); + chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left, 
combined_left[read_num]); + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right, combined_right[read_num]); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], combined_left[read_num]); + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], combined_right[read_num]); + + } } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "\t finished with snarl " << snarl_index.id_in_parent - << "with best distances " << chain_clusters.best_left - << " " << chain_clusters.best_right + << "with best distances " << chain_clusters.fragment_best_left + << " " << chain_clusters.fragment_best_right << ", clusters:" < dists = tree_state.read_cluster_dists[c]; + for (pair c : chain_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -916,17 +985,17 @@ cerr << " Combining this cluster from the right" << endl; if (last_rank != chain_index.prefix_sum.size() - 2) { //If the last snarl we traversed was not the end of the chain, //Extend the right bound of each cluster to the end of the chain - chain_clusters.best_right = -1; - int64_t last_dist = last_rank == 0 ? 0 : - chain_index.prefix_sum[last_rank] - 1; - int64_t dist_to_end = chain_index.chainLength() - - last_dist - last_len; - for (size_t i : chain_clusters.cluster_heads) { - int64_t d = tree_state.read_cluster_dists[i].second; - tree_state.read_cluster_dists[i].second = d == -1 ? -1 - : d + dist_to_end; - chain_clusters.best_right = min_not_minus_one(chain_clusters.best_right, - tree_state.read_cluster_dists[i].second); + chain_clusters.fragment_best_right = -1; + chain_clusters.read_best_right.assign(tree_state.all_seeds->size(), -1); + int64_t last_dist = last_rank == 0 ? 0 : chain_index.prefix_sum[last_rank] - 1; + int64_t dist_to_end = chain_index.chainLength() - last_dist - last_len; + for (pair cluster_head : chain_clusters.read_cluster_heads) { + int64_t d = tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second; + tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second = d == -1 ? 
-1: d + dist_to_end; + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right, + tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second); + chain_clusters.read_best_right[cluster_head.first] = min_not_minus_one(chain_clusters.read_best_right[cluster_head.first], + tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second); } } @@ -936,98 +1005,100 @@ cerr << " Combining this cluster from the right" << endl; //looping around the chain // int64_t first_length = chain_index.prefix_sum[0]-1; - vector to_erase; //old cluster group ids + vector> to_erase; //old cluster group ids //New cluster- there will be at most one new cluster to add - size_t combined_cluster = -1; + vector combined_cluster (tree_state.all_seeds->size(), -1); size_t fragment_combined_cluster = -1; - for (size_t i : chain_clusters.cluster_heads) { + for (pair cluster_head : chain_clusters.read_cluster_heads) { //For each chain cluster - pair& chain_dists = tree_state.read_cluster_dists[i]; + size_t read_num = cluster_head.first; + pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; - if ((chain_dists.second != -1 && chain_clusters.best_left != -1 && - chain_dists.second + chain_clusters.best_left - first_length - 1 + if ((chain_dists.second != -1 && chain_clusters.read_best_left[read_num] != -1 && + chain_dists.second + chain_clusters.read_best_left[read_num] - first_length - 1 <= tree_state.read_distance_limit) || - (chain_dists.first != -1 && chain_clusters.best_right != -1 && - chain_dists.first + chain_clusters.best_right - first_length - 1 + (chain_dists.first != -1 && chain_clusters.read_best_right[read_num] != -1 && + chain_dists.first + chain_clusters.read_best_right[read_num] - first_length - 1 <= tree_state.read_distance_limit)){ //If this chain cluster is in the combined cluster - if (combined_cluster == -1) { - combined_cluster = i; + if (combined_cluster[read_num] == -1) { + combined_cluster[read_num] = cluster_head.second; } else { - tree_state.read_union_find.union_groups(combined_cluster, i); + tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); if (tree_state.fragment_distance_limit != 0) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second + tree_state.all_seeds->size()); } - size_t new_group = tree_state.read_union_find.find_group(i); - if (new_group == i) { - to_erase.push_back(combined_cluster); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); + if (new_group == cluster_head.second) { + to_erase.emplace_back(read_num, combined_cluster[read_num]); } else { - to_erase.push_back(i); + to_erase.emplace_back(read_num, cluster_head.second); } - combined_cluster = new_group; + combined_cluster[read_num] = new_group; } if (tree_state.fragment_distance_limit != 0) { - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second + tree_state.all_seeds->size()); } } else if (tree_state.fragment_distance_limit != 0 && - ((chain_dists.second != -1 && chain_clusters.best_left != -1 && - chain_dists.second + chain_clusters.best_left - first_length - 1 + ((chain_dists.second != -1 && chain_clusters.fragment_best_left != -1 && + chain_dists.second + chain_clusters.fragment_best_left - first_length - 1 <= 
tree_state.fragment_distance_limit) || - (chain_dists.first != -1 && chain_clusters.best_right != -1 && - chain_dists.first + chain_clusters.best_right - first_length - 1 + (chain_dists.first != -1 && chain_clusters.fragment_best_right != -1 && + chain_dists.first + chain_clusters.fragment_best_right - first_length - 1 <= tree_state.fragment_distance_limit))){ //If we can cluster by fragment - if (fragment_combined_cluster == -1 ) { - fragment_combined_cluster = i; - } else { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + if (fragment_combined_cluster != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); + } } - for (size_t i : to_erase) { - chain_clusters.cluster_heads.erase(i); + for (auto c : to_erase) { + chain_clusters.read_cluster_heads.erase(c); } //Don't need to update best left and right distances because //a looping chain will be the top level chain } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Found clusters on chain " << chain_index.id_in_parent << endl; - cerr << "best left : " << chain_clusters.best_left << " best right : " - << chain_clusters.best_right << endl; - for (size_t c : chain_clusters.cluster_heads) { + cerr << "best left : " << chain_clusters.fragment_best_left << " best right : " + << chain_clusters.fragment_best_right << endl; + for (pair c : chain_clusters.read_cluster_heads) { cerr << "\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; } bool got_left = false; bool got_right = false; - for (size_t c : chain_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; + for (pair c : chain_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; if (!chain_index.is_looping_chain){ - assert(dists.first == -1 || dists.first >= chain_clusters.best_left); - assert(dists.second == -1 || dists.second >= chain_clusters.best_right); + assert(dists.first == -1 || dists.first >= chain_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= chain_clusters.fragment_best_right); + assert(dists.first == -1 || dists.first >= chain_clusters.read_best_left[c.first]); + assert(dists.second == -1 || dists.second >= chain_clusters.read_best_right[c.first]); } - if (dists.first == chain_clusters.best_left) {got_left = true;} - if (dists.second == chain_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " + if (dists.first == chain_clusters.fragment_best_left) {got_left = true;} + if (dists.second == chain_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " << dists.second << endl; } if (!chain_index.is_looping_chain) { assert(got_left); assert(got_right); } - for (size_t group_id : chain_clusters.cluster_heads) { + for (pair group_id : chain_clusters.read_cluster_heads) { - assert (group_id == 
tree_state.read_union_find.find_group(group_id)); + assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); } #endif @@ -1042,66 +1113,69 @@ cerr << " Combining this cluster from the right" << endl; * Nodes have not yet been clustered */ MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[snarl_index_i]; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding clusters on snarl number " << snarl_index_i << " headed by node " << snarl_index.id_in_parent << endl; #endif //Keep track of all clusters on this snarl - NodeClusters snarl_clusters; + NodeClusters snarl_clusters(tree_state.all_seeds->size()); auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, - size_t& fragment_combined_group, int64_t dist, - pair& dists){ + size_t& fragment_combined_group, int64_t read_dist, + int64_t fragment_dist, + pair& end_dists, size_t read_num){ //Helper function to compare and combine clusters in two nodes of the same snarl - //If the distance (dist) between two clusters is small enough, then combine them + //If the distance between two clusters is small enough, then combine them //for the read clusters and, if applicable, for the fragment clusters //Updates the distances stored for the read clusters - if (dist <= tree_state.read_distance_limit) { - //If the clusters are close enough to combine the reads + if (read_dist <= tree_state.read_distance_limit) { + //If the clusters are close enough to combine in the read if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_group != -1) { //Also combine fragment clusters - tree_state.fragment_union_find.union_groups(new_group, fragment_combined_group); + tree_state.fragment_union_find.union_groups(new_group+tree_state.read_index_offsets[read_num], + fragment_combined_group); } - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + fragment_combined_group = tree_state.fragment_union_find.find_group(new_group+tree_state.read_index_offsets[read_num]); } if (combined_group == -1) { - snarl_clusters.cluster_heads.insert(new_group); - tree_state.read_cluster_dists[new_group] = dists; + snarl_clusters.read_cluster_heads.emplace(read_num,new_group); + tree_state.read_cluster_dists[read_num][new_group] = end_dists; combined_group = new_group; } else { - //Combine the clusters + //Combine the clusters within the same read - combined_group = tree_state.read_union_find.find_group(combined_group); - pairold_dists = tree_state.read_cluster_dists[combined_group]; - tree_state.read_union_find.union_groups(new_group, combined_group); + combined_group = tree_state.read_union_find[read_num].find_group(combined_group); + pairold_dists = tree_state.read_cluster_dists[read_num][combined_group]; + tree_state.read_union_find[read_num].union_groups(new_group, combined_group); //Update distances and cluster head of new cluster - size_t new_g = tree_state.read_union_find.find_group(new_group); + size_t new_g = tree_state.read_union_find[read_num].find_group(new_group); if (new_g != new_group) { - snarl_clusters.cluster_heads.erase(new_group); + snarl_clusters.read_cluster_heads.erase(make_pair(read_num,new_group)); } if (new_g != combined_group) { - snarl_clusters.cluster_heads.erase(combined_group); + snarl_clusters.read_cluster_heads.erase(make_pair(read_num,combined_group)); } - snarl_clusters.cluster_heads.insert(new_g); - dists = make_pair( - min_not_minus_one(dists.first, old_dists.first), - min_not_minus_one(dists.second, old_dists.second)); - 
tree_state.read_cluster_dists[new_g] = dists; + snarl_clusters.read_cluster_heads.emplace(read_num,new_g); + end_dists = make_pair( + min_not_minus_one(end_dists.first, old_dists.first), + min_not_minus_one(end_dists.second, old_dists.second)); + tree_state.read_cluster_dists[read_num][new_g] = end_dists; new_group = new_g; combined_group = new_g; } } else if (tree_state.fragment_distance_limit != 0 - && dist <= tree_state.fragment_distance_limit) { + && fragment_dist <= tree_state.fragment_distance_limit) { //Same fragment if (fragment_combined_group == -1) { fragment_combined_group = new_group; } else { - tree_state.fragment_union_find.union_groups(new_group, fragment_combined_group); + tree_state.fragment_union_find.union_groups( + new_group + tree_state.read_index_offsets[read_num], fragment_combined_group); fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); } } @@ -1118,7 +1192,7 @@ cerr << " Combining this cluster from the right" << endl; //Maps each cluster of child nodes to its left and right distances //of the node its on - hash_map> old_dists; + hash_map, pair> old_dists; for (size_t i = 0; i < child_nodes.size() ; i++) { //Go through each child node of the netgraph and get clusters @@ -1147,7 +1221,7 @@ cerr << " Combining this cluster from the right" << endl; //Represents all the clusters on this child node NodeClusters& curr_child_clusters = child_nodes[i].second; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding distances to parent snarl " << snarl_index_i << " ends from child " << i << "/" << child_nodes.size() << endl; cerr << "Child is " << typeToString(child.node_type) << " number " @@ -1155,13 +1229,13 @@ cerr << " Combining this cluster from the right" << endl; cerr << "Node rank is " << node_rank << " fwd, " << rev_rank << " rev of " << snarl_index.num_nodes * 2 << endl; cerr << "Clusters at this child:" << endl; - for (size_t c : child_nodes[i].second.cluster_heads) { - cerr << "\tdist left: " << tree_state.read_cluster_dists[c].first - << " dist right: " << tree_state.read_cluster_dists[c].second << endl; + for (pair c : child_nodes[i].second.read_cluster_heads) { + cerr << "\tdist left: " << tree_state.read_cluster_dists[c.first][c.second].first + << " dist right: " << tree_state.read_cluster_dists[c.first][c.second].second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -1171,33 +1245,38 @@ cerr << " Combining this cluster from the right" << endl; assert(node_rank != numeric_limits::max()); #endif - vector children_i( - make_move_iterator(curr_child_clusters.cluster_heads.begin()), - make_move_iterator(curr_child_clusters.cluster_heads.end())); + vector> children_i( + make_move_iterator(curr_child_clusters.read_cluster_heads.begin()), + make_move_iterator(curr_child_clusters.read_cluster_heads.end())); for (size_t c_i = 0 ; c_i < children_i.size() ; c_i ++) { //for each cluster of child node i, find the distances to the //ends of the snarl - size_t c = children_i[c_i]; + pair child_cluster_head = children_i[c_i]; - pair dists_c= tree_state.read_cluster_dists[c]; - old_dists[c] = dists_c; + pair dists_c = 
tree_state.read_cluster_dists[child_cluster_head.first][child_cluster_head.second]; + old_dists[child_cluster_head] = dists_c; + //TODO: Do this only once pair new_dists = snarl_index.distToEnds(node_rank, dists_c.first,dists_c.second); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_parent << " : " << new_dists.first << " " << new_dists.second << endl; #endif - snarl_clusters.best_left =min_not_minus_one(snarl_clusters.best_left, - new_dists.first); - snarl_clusters.best_right = min_not_minus_one( - snarl_clusters.best_right, new_dists.second); + snarl_clusters.fragment_best_left =min_not_minus_one( + snarl_clusters.fragment_best_left,new_dists.first); + snarl_clusters.fragment_best_right = min_not_minus_one( + snarl_clusters.fragment_best_right, new_dists.second); + snarl_clusters.read_best_left[child_cluster_head.first] =min_not_minus_one( + snarl_clusters.read_best_left[child_cluster_head.first], new_dists.first); + snarl_clusters.read_best_right[child_cluster_head.first] = min_not_minus_one( + snarl_clusters.read_best_right[child_cluster_head.first], new_dists.second); - snarl_clusters.cluster_heads.insert(c); - tree_state.read_cluster_dists[c] = new_dists; + snarl_clusters.read_cluster_heads.insert(child_cluster_head); + tree_state.read_cluster_dists[child_cluster_head.first][child_cluster_head.second] = new_dists; } @@ -1209,19 +1288,19 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa NodeClusters& other_node_clusters = child_nodes[j].second; id_t other_node_id = other_node.id_in_parent(dist_index); + //Rank of this node in the snarl + size_t other_rank = other_node.rank_in_parent(dist_index, + other_node_id); + size_t other_rev = other_rank % 2 == 0 + ? other_rank + 1 : other_rank - 1; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Other net graph node is " << typeToString(other_node.node_type) << " headed by node " << other_node_id; #endif - //Rank of this node in the snarl - size_t other_rank = other_node.rank_in_parent(dist_index, - other_node_id); - size_t other_rev = other_rank % 2 == 0 - ? 
other_rank + 1 : other_rank - 1; //Find distance from each end of current node (i) to //each end of other node (j) @@ -1233,7 +1312,7 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa node_rank, other_rank); int64_t dist_r_r = snarl_index.snarlDistance( node_rank, other_rev); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "\t distances between ranks " << node_rank << " and " << other_rank << ": " << dist_l_l << " " << dist_l_r << " " << dist_r_l << " " << dist_r_r << endl; @@ -1241,10 +1320,10 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank //group ids of clusters combined between node i left and //node j left, etc - size_t group_l_l = -1; - size_t group_l_r = -1; - size_t group_r_l = -1; - size_t group_r_r = -1; + vector group_l_l (tree_state.all_seeds->size(), -1); + vector group_l_r (tree_state.all_seeds->size(), -1); + vector group_r_l (tree_state.all_seeds->size(), -1); + vector group_r_r (tree_state.all_seeds->size(), -1); size_t fragment_group_l_l = -1; size_t fragment_group_l_r = -1; size_t fragment_group_r_l = -1; @@ -1254,125 +1333,141 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank && ((tree_state.fragment_distance_limit == 0 && MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, dist_r_l, dist_r_r})-2 <= tree_state.read_distance_limit - && min_not_minus_one(curr_child_clusters.best_left, curr_child_clusters.best_right)-2 + && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.read_distance_limit) || (tree_state.fragment_distance_limit != 0 && MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, dist_r_l, dist_r_r})-2 <= tree_state.fragment_distance_limit - && min_not_minus_one(curr_child_clusters.best_left, curr_child_clusters.best_right)-2 + && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.fragment_distance_limit) )) { //If the two nodes are reachable for (size_t c_i = 0 ; c_i < children_i.size() ; c_i ++) { //for each cluster of child node i - size_t c = children_i[c_i]; - size_t c_group = tree_state.read_union_find.find_group(c); + pair child_cluster_head = children_i[c_i]; + size_t read_num = child_cluster_head.first; + size_t c_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - pair new_dists; - pair dists_c; - - dists_c = old_dists[c]; - new_dists = tree_state.read_cluster_dists[c_group]; + pair new_dists = tree_state.read_cluster_dists[read_num][c_group]; + pair dists_c = old_dists[child_cluster_head]; if (dist_l_l != -1 && dists_c.first != -1 - && other_node_clusters.best_left != -1 ) { - //If cluster c can be combined with clusters in j + && other_node_clusters.fragment_best_left != -1 ) { + //If cluster child_cluster_head can be combined with clusters in j //from the left of both of them - combine_clusters(c_group, group_l_l, fragment_group_l_l, - dist_l_l + dists_c.first + other_node_clusters.best_left-1, new_dists); + combine_clusters(c_group, group_l_l[read_num], fragment_group_l_l, + dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1, + dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1, + new_dists, read_num); } if (dist_l_r != -1 && dists_c.first != -1 - && other_node_clusters.best_right != -1 ) { + && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j - combine_clusters(c_group, group_l_r, fragment_group_l_r, - 
dist_l_r + dists_c.first + other_node_clusters.best_right-1, new_dists); + combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, + dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1, + dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1, + new_dists, read_num); } if (dist_r_l != -1 && dists_c.second != -1 - && other_node_clusters.best_left != -1 ) { - combine_clusters(c_group, group_r_l, fragment_group_r_l, - dist_r_l + dists_c.second + other_node_clusters.best_left-1, new_dists); + && other_node_clusters.fragment_best_left != -1 ) { + combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, + dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1, + dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1, + new_dists, read_num); } if (dist_r_r != -1 && dists_c.second != -1 - && other_node_clusters.best_right != -1 ) { - combine_clusters(c_group, group_r_r, fragment_group_r_r, - dist_r_r + dists_c.second + other_node_clusters.best_right-1, new_dists); + && other_node_clusters.fragment_best_right != -1 ) { + combine_clusters(c_group, group_r_r[read_num], fragment_group_r_r, + dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1, + dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1, + new_dists, read_num); } } //Go through children of j - vector children_j( - make_move_iterator(other_node_clusters.cluster_heads.begin()), - make_move_iterator(other_node_clusters.cluster_heads.end())); + vector> children_j( + make_move_iterator(other_node_clusters.read_cluster_heads.begin()), + make_move_iterator(other_node_clusters.read_cluster_heads.end())); for (size_t k_i = 0 ; k_i < children_j.size() ; k_i++){ - size_t k = children_j[k_i]; //For each cluster of child j, find which overlaps with //clusters of i - //k will already be part of a cluster in + //child_cluster_head will already be part of a cluster in //snarlcluster heads but since we need to know the node //that the snarl is on we can't just loop through //snarl_cluster heads - pair& dist_bounds_k = old_dists[k]; - size_t k_group = tree_state.read_union_find.find_group(k); - pair dists_k = tree_state.read_cluster_dists[k_group]; + pair child_cluster_head = children_j[k_i]; + size_t read_num = child_cluster_head.first; + pair& dist_bounds_k = old_dists[child_cluster_head]; + size_t k_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); + pair dists_k = tree_state.read_cluster_dists[read_num][k_group]; - if (dist_l_l != -1 && curr_child_clusters.best_left != -1 + if (dist_l_l != -1 && curr_child_clusters.read_best_left[read_num] != -1 && dist_bounds_k.first != -1 ){ - combine_clusters(k_group, group_l_l, fragment_group_l_l, - dist_l_l + curr_child_clusters.best_left + dist_bounds_k.first-1, dists_k); + combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, + dist_l_l + curr_child_clusters.fragment_best_left + dist_bounds_k.first-1, + dist_l_l + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.first-1, + dists_k, read_num); } - if (dist_l_r != -1 && curr_child_clusters.best_left != -1 + if (dist_l_r != -1 && curr_child_clusters.read_best_left[read_num] != -1 && dist_bounds_k.second != -1 ) { - combine_clusters(k_group, group_l_r, fragment_group_l_r, - dist_l_r + curr_child_clusters.best_left + dist_bounds_k.second-1, dists_k); + combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, + dist_l_r + curr_child_clusters.fragment_best_left + 
dist_bounds_k.second-1, + dist_l_r + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.second-1, + dists_k, read_num); } - if (dist_r_l != -1 && curr_child_clusters.best_right != -1 + if (dist_r_l != -1 && curr_child_clusters.read_best_right[read_num] != -1 && dist_bounds_k.first != -1 ) { - combine_clusters(k_group, group_r_l, fragment_group_r_l, - dist_r_l + curr_child_clusters.best_right + dist_bounds_k.first-1, dists_k); + combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, + dist_r_l + curr_child_clusters.fragment_best_right + dist_bounds_k.first-1, + dist_r_l + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.first-1, + dists_k,read_num); } - if (dist_r_r != -1 && curr_child_clusters.best_right != -1 + if (dist_r_r != -1 && curr_child_clusters.read_best_right[read_num] != -1 && dist_bounds_k.second != -1 ) { - combine_clusters(k_group, group_r_r, fragment_group_r_r, - dist_r_r + curr_child_clusters.best_right + dist_bounds_k.second-1, dists_k); + combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, + dist_r_r + curr_child_clusters.fragment_best_right + dist_bounds_k.second-1, + dist_r_r + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.second-1, + dists_k, read_num); } } } } } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Found clusters on snarl number " << snarl_index_i << " headed by" << snarl_index.id_in_parent << endl; - cerr << " with best left and right values: " << snarl_clusters.best_left << " " - << snarl_clusters.best_right << endl; + cerr << " with best left and right values: " << snarl_clusters.fragment_best_left << " " + << snarl_clusters.fragment_best_right << endl; bool got_left = false; bool got_right = false; - for (size_t c : snarl_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - if (dists.first == snarl_clusters.best_left) {got_left = true;} - if (dists.second == snarl_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " + for (pair c : snarl_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + if (dists.first == snarl_clusters.fragment_best_left) {got_left = true;} + if (dists.second == snarl_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; } assert(got_left); assert(got_right); - for (size_t group_id : snarl_clusters.cluster_heads) { - assert (group_id == tree_state.read_union_find.find_group(group_id)); + + for (pair group_id : snarl_clusters.read_cluster_heads) { + assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); } #endif return snarl_clusters; diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index 03c4d20346c..d90410798ae 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -14,6 +14,8 @@ class SnarlSeedClusterer { SnarlSeedClusterer(MinimumDistanceIndex& dist_index); + typedef vector> cluster_group_t; + ///Given a vector of seeds (pos_t) and a distance limit, //cluster the seeds such that two 
seeds whose minimum distance
        //between them (including both of the positions) is less than
@@ -21,8 +23,7 @@ class SnarlSeedClusterer {
        //
        //Returns a vector of clusters. Each cluster is a vector of
        //indices into seeds
-        vector<vector<size_t>> cluster_seeds (
-                      vector<pos_t> seeds, int64_t read_distance_limit) const;
+        cluster_group_t cluster_seeds ( vector<pos_t> seeds, int64_t read_distance_limit) const;

        ///The same thing, but for paired end reads.
        //Given seeds from multiple reads of a fragment, cluster each set of seeds
        //The read clusters refer to seeds by their indexes in the input vectors of seeds
        //The fragment clusters give seeds the index they would get if the vectors of
        // seeds were appended to each other in the order given
-        tuple<vector<vector<size_t>>, vector<vector<size_t>>> cluster_seeds (
+        tuple<vector<cluster_group_t>, cluster_group_t> cluster_seeds (
                vector<vector<pos_t>> all_seeds,
                int64_t read_distance_limit, int64_t fragment_distance_limit=0) const;

@@ -41,6 +42,7 @@
        MinimumDistanceIndex& dist_index;

        enum ChildNodeType {CHAIN, SNARL, NODE};
+
        static inline string typeToString(ChildNodeType t) {
            switch (t) {
@@ -105,18 +107,24 @@
            // snarl/chain that is a node the parent snarl's netgraph,
            // or a snarl in a chain
+
            //set of the indices of heads of clusters (group ids in the
            //union find)
-            hash_set<size_t> cluster_heads;
+            //TODO: Add cluster distances here
+            //pair of read index, seed index
+            hash_set<pair<size_t, size_t>> read_cluster_heads;

            //The shortest distance from any seed in any cluster to the
            //left/right end of the snarl tree node that contains these
            //clusters
-            int64_t best_left;
-            int64_t best_right;
-
-            NodeClusters() :
-                best_left(-1), best_right(-1) {}
+            int64_t fragment_best_left;
+            int64_t fragment_best_right;
+            vector<int64_t> read_best_left;
+            vector<int64_t> read_best_right;
+
+            NodeClusters(size_t read_count) :
+                fragment_best_left(-1), fragment_best_right(-1),
+                read_best_left(read_count, -1), read_best_right(read_count, -1){}
        };

@@ -130,7 +138,7 @@
            vector<vector<pos_t>>* all_seeds;

            //Vector of the offset of indices for each seed
-            vector<size_t> seed_index_offsets;
+            vector<size_t> read_index_offsets;

            //The minimum distance between nodes for them to be put in the
            //same cluster
@@ -148,7 +156,8 @@
            //of the netgraph node of the cluster it belongs to
            //These values are only relevant for seeds that represent a cluster
            //in union_find_reads
-            vector<pair<int64_t, int64_t>> read_cluster_dists;
+            vector<vector<pair<int64_t, int64_t>>> read_cluster_dists;
+            vector<pair<int64_t, int64_t>> fragment_cluster_dists;

@@ -158,7 +167,7 @@
            //Maps each node to a vector of the seeds that are contained in it
            //seeds are represented by indexes into the seeds vector
            //The array is sorted.
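// The best-distance fields declared above all treat -1 as "unreachable",
// so updates go through min_not_minus_one rather than std::min, and every
// per-read update also tightens the fragment-wide bound. A minimal
// self-contained sketch of that convention: min_not_minus_one follows its
// uses in this patch series, while NodeClustersSketch and update_left are
// simplified stand-ins for illustration, not the vg types.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// -1 acts as infinity: it loses to any real distance and survives only
// when both inputs are -1.
inline int64_t min_not_minus_one(int64_t a, int64_t b) {
    return a == -1 ? b : (b == -1 ? a : std::min(a, b));
}

struct NodeClustersSketch {
    // One fragment-wide bound plus one bound per read, all unreachable
    // until a seed is seen.
    int64_t fragment_best_left = -1;
    std::vector<int64_t> read_best_left;

    explicit NodeClustersSketch(size_t read_count)
        : read_best_left(read_count, -1) {}

    // Mirrors the paired updates made throughout this patch: the read
    // bound and the fragment bound are tightened together.
    void update_left(size_t read_num, int64_t dist) {
        read_best_left[read_num] = min_not_minus_one(read_best_left[read_num], dist);
        fragment_best_left = min_not_minus_one(fragment_best_left, dist);
    }
};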
-            vector<pair<id_t, size_t>> node_to_seeds;
+            vector<vector<pair<id_t, size_t>>> node_to_seeds;

            //Map from snarl (index into dist_index.snarl_indexes) i
            //to the netgraph nodes contained in the snarl as well as the
@@ -185,18 +194,23 @@
            //Constructor takes in a pointer to the seeds and the distance limit
            TreeState (vector<vector<pos_t>>* all_seeds, int64_t read_distance_limit,
-                    int64_t fragment_distance_limit) :
+                    int64_t fragment_distance_limit, size_t seed_count) :
                all_seeds(all_seeds),
-                read_cluster_dists(seeds->size(), make_pair(-1, -1)),
-                read_union_find (seeds->size(), false),
-                fragment_union_find (seeds->size(), false),
+                fragment_cluster_dists(all_seeds->size(), make_pair(-1, -1)),
                read_distance_limit(read_distance_limit),
-                fragment_distance_limit(fragment_distance_limit){
-                seed_index_offsets.push_back(0);
-                for (auto& v : all_seeds) {
-                    size_t offset = seed_index_offsets.back() + v.size();
-                    seed_index_offsets.push_back(offset);
-                }
+                fragment_distance_limit(fragment_distance_limit),
+                fragment_union_find (seed_count, false) {
+
+                read_index_offsets.push_back(0);
+                size_t total_seeds = 0;
+                for (vector<pos_t>& v : *all_seeds) {
+                    total_seeds += v.size();
+                    size_t offset = read_index_offsets.back() + v.size();
+                    read_index_offsets.push_back(offset);
+                    read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1));
+                    node_to_seeds.emplace_back();
+                    read_union_find.emplace_back(v.size(), false);
+                }
            }
        };

diff --git a/src/subcommand/cluster_main.cpp b/src/subcommand/cluster_main.cpp
index f44945f60c5..ea8ae030af0 100644
--- a/src/subcommand/cluster_main.cpp
+++ b/src/subcommand/cluster_main.cpp
@@ -257,8 +257,7 @@ int main_cluster(int argc, char** argv) {
    // Cluster the seeds. Get sets of input seed indexes that go together.
    // Make sure to time it.
    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
-    tuple<vector<vector<size_t>>,vector<vector<size_t>>> paired_clusters = clusterer.cluster_seeds(seeds, distance_limit);
-    vector<vector<size_t>> clusters = std::move(std::get<0>(paired_clusters));
+    vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, distance_limit);
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds = end-start;

diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp
index 0e2899224c6..25d7a287e37 100644
--- a/src/unittest/seed_clusterer.cpp
+++ b/src/unittest/seed_clusterer.cpp
@@ -75,9 +75,7 @@ namespace unittest {
                seeds.push_back(make_pos_t(n, false, 0));
            }

-            tuple<vector<vector<size_t>>, vector<vector<size_t>>> paired_clusters =
-                clusterer.cluster_seeds(seeds, 10);
-            vector<vector<size_t>> clusters = std::get<0>(paired_clusters);
+            vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, 10);

            REQUIRE(clusters.size() == 1);
        }
@@ -92,9 +90,7 @@
            }

-            tuple<vector<vector<size_t>>, vector<vector<size_t>>> paired_clusters =
-                clusterer.cluster_seeds(seeds, 7);
-            vector<vector<size_t>> clusters = std::get<0>(paired_clusters);
+            vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, 7);
            vector<hash_set<size_t>> cluster_sets;
            for (vector<size_t> v : clusters) {
                hash_set<size_t> h;
                for (size_t s : v) {
                    h.insert(s);
                }
                cluster_sets.push_back(h);
            }
@@ -123,7 +119,8 @@
            }
            SECTION( "One fragment cluster" ) {
-                vector<id_t> seed_nodes( {2, 3, 4, 7, 8, 10, 11});
+                vector<id_t> seed_nodes( {2, 3, 4});
+                vector<id_t> seed_nodes1({7, 8, 10, 11});
                //Clusters should be {2, 3, 4}, {7, 8, 10, 11}
                //One fragment cluster
                //Distance from pos on 4 to pos on 7 is 8, including one position
                vector<pos_t> seeds;
                for (id_t n : seed_nodes) {
                    seeds.push_back(make_pos_t(n, false, 0));
                }
+                vector<pos_t> seeds1;
+                for (id_t n : seed_nodes1) {
+                    seeds1.push_back(make_pos_t(n, false, 0));
+                }
+                vector<vector<pos_t>> all_seeds;
+                all_seeds.push_back(seeds);
+                all_seeds.push_back(seeds1);

-
tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 7, 15); - vector> clusters = std::get<0>(paired_clusters); + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, 7, 15); + vector>> read_clusters = std::get<0>(paired_clusters); + //Should be [[[0,1,2]],[[3,4,5,6]]] vector> fragment_clusters = std::get<1>(paired_clusters); vector> cluster_sets; - for (vector v : clusters) { + for (vector v : read_clusters[0]) { hash_set h; for (size_t s : v) { h.insert(s); } cluster_sets.push_back(h); } - REQUIRE( clusters.size() == 2); + for (vector v : read_clusters[1]) { + hash_set h; + for (size_t s : v) { + h.insert(s); + } + cluster_sets.push_back(h); + } + REQUIRE( read_clusters.size() == 2); + REQUIRE( (read_clusters[0].size() == 3 || read_clusters[1].size() == 3)); + REQUIRE( (read_clusters[0].size() == 4 || read_clusters[1].size() == 4)); REQUIRE( fragment_clusters.size() == 1); - REQUIRE (( (cluster_sets[0].count(0) == 1 && + REQUIRE (((cluster_sets[0].count(0) == 1 && cluster_sets[0].count(1) == 1 && cluster_sets[0].count(2) == 1 && cluster_sets[1].count(3) == 1 && @@ -166,19 +180,29 @@ namespace unittest { } SECTION( "Two fragment clusters" ) { - vector seed_nodes( {2, 3, 4, 7, 8, 10, 11}); + vector seed_nodes( {2, 3, 4}); + vector seed_nodes1({7, 8, 10, 11}); //Fragment clusters should be {2, 3, 4}, {7, 8, 10, 11} //Distance from pos on 4 to pos on 7 is 8, including one position vector seeds; for (id_t n : seed_nodes) { seeds.push_back(make_pos_t(n, false, 0)); } + vector seeds1; + for (id_t n : seed_nodes1) { + seeds.push_back(make_pos_t(n, false, 0)); + } + vector> all_seeds; + all_seeds.push_back(seeds); + all_seeds.push_back(seeds1); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 2, 7); - vector> clusters = std::get<0>(paired_clusters); + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, 2, 7); + vector>> read_clusters = std::get<0>(paired_clusters); + // read_clusters = [ [[0,1,2]],[[3,4,5,6]] ] vector> fragment_clusters = std::get<1>(paired_clusters); + // fragment_clusters = [ [0,1,2], [3,4,5,6] ] vector> fragment_cluster_sets; for (vector v : fragment_clusters) { hash_set h; @@ -187,7 +211,7 @@ namespace unittest { } fragment_cluster_sets.push_back(h); } - REQUIRE( clusters.size() == 3); + REQUIRE( read_clusters.size() == 2); REQUIRE( fragment_clusters.size() == 2); REQUIRE (( (fragment_cluster_sets[0].count(0) == 1 && fragment_cluster_sets[0].count(1) == 1 && @@ -251,9 +275,7 @@ namespace unittest { seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 13); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 13); REQUIRE( clusters.size() == 1); } @@ -262,9 +284,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(11, false, 9)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 8); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 8); REQUIRE( clusters.size() == 1); @@ -313,9 +333,7 @@ namespace unittest { seeds.push_back(make_pos_t(7, false, 0)); seeds.push_back(make_pos_t(6, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 20); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 20); REQUIRE( clusters.size() == 1); @@ -325,9 +343,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); 
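// For the fragment-cluster sections above: read clusters index seeds
// within each read's own vector, while fragment clusters number seeds
// globally, in the order the per-read vectors are concatenated. The
// clusterer bridges the two with read_index_offsets, a prefix sum of
// per-read seed counts. A small standalone sketch of that mapping;
// build_offsets is a hypothetical helper, only the prefix-sum scheme
// is taken from this patch series.

#include <cstddef>
#include <vector>

std::vector<size_t> build_offsets(const std::vector<size_t>& seeds_per_read) {
    // offsets[r] is the global index of the first seed of read r, so
    // (read, local index) maps to offsets[read] + local index.
    std::vector<size_t> offsets{0};
    for (size_t n : seeds_per_read) {
        offsets.push_back(offsets.back() + n);
    }
    return offsets;
}

int main() {
    // Two reads with 3 and 4 seeds, as in "One fragment cluster":
    std::vector<size_t> offsets = build_offsets({3, 4});
    // Seed 1 of read 1 is seed offsets[1] + 1 == 4 of the fragment.
    return offsets[1] + 1 == 4 ? 0 : 1;
}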
seeds.push_back(make_pos_t(6, true, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 20); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 20); } @@ -336,9 +352,7 @@ namespace unittest { seeds.push_back(make_pos_t(8, false, 0)); seeds.push_back(make_pos_t(6, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 20); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 20); REQUIRE( clusters.size() == 1); @@ -411,9 +425,7 @@ namespace unittest { seeds.push_back(make_pos_t(6, false, 0)); seeds.push_back(make_pos_t(8, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 2); vector> cluster_sets; @@ -455,30 +467,58 @@ namespace unittest { seeds.push_back(make_pos_t(14, false, 0)); seeds.push_back(make_pos_t(15, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); - vector> fragment_clusters = std::get<1>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 4); - REQUIRE( fragment_clusters.size() == seeds.size()); + + vector> all_seeds; + all_seeds.push_back(seeds); + tuple>>, vector>> paired_clusters = clusterer.cluster_seeds(all_seeds, 3, 3); + vector>> read_clusters = std::get<0>(paired_clusters); + vector>fragment_clusters = std::get<1>(paired_clusters); + + REQUIRE( read_clusters.size() == 1); + REQUIRE( read_clusters[0].size() == 1); + REQUIRE( fragment_clusters.size() == 4); //New fragment clusters + } SECTION ("Four fragment clusters") { + vector> all_seeds; + vector seeds; + seeds.push_back(make_pos_t(3, false, 0)); + seeds.push_back(make_pos_t(5, false, 0)); + seeds.push_back(make_pos_t(16, false, 0)); + //New cluster + seeds.push_back(make_pos_t(6, false, 0)); + seeds.push_back(make_pos_t(8, false, 0)); + all_seeds.push_back(seeds); + seeds.clear(); + //New cluster + seeds.push_back(make_pos_t(5, false, 8)); + //New cluster + seeds.push_back(make_pos_t(13, false, 1)); + seeds.push_back(make_pos_t(14, false, 0)); + seeds.push_back(make_pos_t(15, false, 0)); + all_seeds.push_back(seeds); - paired_clusters = clusterer.cluster_seeds(seeds, 3, 3); - clusters = std::get<0>(paired_clusters); - fragment_clusters = std::get<1>(paired_clusters); + tuple>>, vector>> paired_clusters = clusterer.cluster_seeds(all_seeds, 3, 3); + vector>> read_clusters = std::get<0>(paired_clusters); + vector> fragment_clusters = std::get<1>(paired_clusters); - REQUIRE( clusters.size() == 4); + REQUIRE( read_clusters.size() == 2); + REQUIRE( read_clusters[0].size() == 2); + REQUIRE( read_clusters[1].size() == 2); REQUIRE( fragment_clusters.size() == 4); //New fragment clusters - paired_clusters = clusterer.cluster_seeds(seeds, 3, 5); - clusters = std::get<0>(paired_clusters); + paired_clusters = clusterer.cluster_seeds(all_seeds, 3, 5); + read_clusters = std::get<0>(paired_clusters); fragment_clusters = std::get<1>(paired_clusters); - REQUIRE( clusters.size() == 4); + REQUIRE( read_clusters.size() == 2); + REQUIRE( read_clusters[0].size() == 2); + REQUIRE( read_clusters[1].size() == 2); REQUIRE( fragment_clusters.size() == 2); } SECTION( "Same node, same cluster" ) { @@ -487,9 +527,7 @@ namespace unittest { seeds.push_back(make_pos_t(5, false, 11)); 
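// The fragment-limit sections above turn on the interplay of two limits:
// the read distance limit defines read clusters, and the looser (or equal)
// fragment distance limit defines fragment clusters, so every fragment
// cluster is a union of whole read clusters. A one-dimensional toy of that
// relationship; the real clusterer uses minimum graph distances and two
// union-finds, this stand-in just scans sorted offsets.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Positions within `limit` of a neighbor (transitively) share a cluster,
// so the cluster count is one more than the number of oversized gaps.
size_t count_clusters(const std::vector<int64_t>& sorted_pos, int64_t limit) {
    if (sorted_pos.empty()) return 0;
    size_t clusters = 1;
    for (size_t i = 1; i < sorted_pos.size(); i++) {
        if (sorted_pos[i] - sorted_pos[i - 1] > limit) clusters++;
    }
    return clusters;
}

int main() {
    std::vector<int64_t> seeds = {0, 2, 4, 11, 13, 30};
    std::printf("read clusters: %zu\n", count_clusters(seeds, 3));     // 3
    std::printf("fragment clusters: %zu\n", count_clusters(seeds, 8)); // 2
    return 0;
}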
seeds.push_back(make_pos_t(5, false, 5)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 7); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 7); REQUIRE( clusters.size() == 1); @@ -535,9 +573,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(7, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters= clusterer.cluster_seeds(seeds, 10); REQUIRE( clusters.size() == 1); @@ -549,9 +585,7 @@ namespace unittest { seeds.push_back(make_pos_t(7, false, 0)); seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); REQUIRE( clusters.size() == 1); @@ -561,9 +595,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); @@ -575,9 +607,7 @@ namespace unittest { seeds.push_back(make_pos_t(4, false, 1)); seeds.push_back(make_pos_t(6, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 5); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 5); REQUIRE( clusters.size() == 2); @@ -585,9 +615,7 @@ namespace unittest { SECTION("No clusters") { vector seeds; - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 5); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 5); REQUIRE( clusters.size() == 0); @@ -641,9 +669,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(9, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 5); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 5); REQUIRE( clusters.size() == 2); @@ -693,9 +719,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(8, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 2); @@ -707,9 +731,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(7, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 6); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 6); REQUIRE( clusters.size() == 1); @@ -721,9 +743,7 @@ namespace unittest { seeds.push_back(make_pos_t(8, false, 0)); seeds.push_back(make_pos_t(10, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 1); @@ -770,9 +790,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); REQUIRE( 
clusters.size() == 1); @@ -782,9 +800,7 @@ namespace unittest { seeds.push_back(make_pos_t(5, false, 0)); seeds.push_back(make_pos_t(3, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); REQUIRE( clusters.size() == 1); @@ -795,9 +811,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(8, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); @@ -808,9 +822,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(3, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 15); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 15); REQUIRE( clusters.size() == 1); @@ -840,116 +852,127 @@ namespace unittest { uniform_int_distribution randSnarlIndex(0, allSnarls.size()-1); default_random_engine generator(time(NULL)); for (size_t k = 0; k < 100 ; k++) { - vector seeds; + vector> all_seeds; + all_seeds.emplace_back(); + all_seeds.emplace_back(); int64_t read_lim = 20;// Distance between read clusters int64_t fragment_lim = 30;// Distance between fragment clusters - for (int j = 0; j < 20; j++) { - //Check clusters of j random positions - const Snarl* snarl1 = allSnarls[randSnarlIndex(generator)]; + for (size_t read = 0 ; read < 2 ; read ++) { + for (int j = 0; j < 20; j++) { + //Check clusters of j random positions + const Snarl* snarl1 = allSnarls[randSnarlIndex(generator)]; - pair, unordered_set> contents1 = - snarl_manager.shallow_contents(snarl1, graph, true); + pair, unordered_set> contents1 = + snarl_manager.shallow_contents(snarl1, graph, true); - vector nodes1 (contents1.first.begin(), contents1.first.end()); + vector nodes1 (contents1.first.begin(), contents1.first.end()); - uniform_int_distribution randNodeIndex1(0,nodes1.size()-1); + uniform_int_distribution randNodeIndex1(0,nodes1.size()-1); - id_t nodeID1 = nodes1[randNodeIndex1(generator)]; - handle_t node1 = graph.get_handle(nodeID1); + id_t nodeID1 = nodes1[randNodeIndex1(generator)]; + handle_t node1 = graph.get_handle(nodeID1); - off_t offset1 = uniform_int_distribution(0,graph.get_length(node1) - 1)(generator); + off_t offset1 = uniform_int_distribution(0,graph.get_length(node1) - 1)(generator); - pos_t pos = make_pos_t(nodeID1, - uniform_int_distribution(0,1)(generator) == 0,offset1 ); - seeds.push_back(pos); + pos_t pos = make_pos_t(nodeID1, + uniform_int_distribution(0,1)(generator) == 0,offset1 ); + all_seeds[read].push_back(pos); + } } - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, read_lim, fragment_lim); - vector> read_clusters = std::get<0>(paired_clusters); + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, read_lim, fragment_lim); + vector>> read_clusters = std::get<0>(paired_clusters); vector> fragment_clusters = std::get<1>(paired_clusters); + vector ordered_seeds (all_seeds[0]); + for (pos_t s : all_seeds[1]){ + ordered_seeds.push_back(s); + } - for (size_t a = 0; a < read_clusters.size(); a++) { - // For each cluster -cluster this cluster to ensure that - // there is only one - vector clust = read_clusters[a]; - - structures::UnionFind new_clusters (clust.size(), false); - - for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { - pos_t pos1 
= seeds[clust[i1]]; - size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); - pos_t rev1 = make_pos_t(get_id(pos1), - !is_rev(pos1), - len1 - get_offset(pos1)-1); - - for (size_t b = 0 ; b < read_clusters.size() ; b++) { - if (b != a) { - //For each other cluster - vector clust2 = read_clusters[b]; - for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { - //And each position in each other cluster, - //make sure that this position is far away from i1 - pos_t pos2 = seeds[clust2[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should have been in the same read cluster: " ; - cerr << pos1 << " and " << pos2 << endl; - cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; - REQUIRE(false); + for (size_t read_num = 0 ; read_num <= 2 ; read_num ++) { + auto& one_read_clusters = read_clusters[read_num]; + for (size_t a = 0; a < one_read_clusters.size(); a++) { + // For each cluster -cluster this cluster to ensure that + // there is only one + vector clust = one_read_clusters[a]; + + structures::UnionFind new_clusters (clust.size(), false); + + for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { + pos_t pos1 = all_seeds[read_num][clust[i1]]; + size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); + pos_t rev1 = make_pos_t(get_id(pos1), + !is_rev(pos1), + len1 - get_offset(pos1)-1); + + for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { + if (b != a) { + //For each other cluster + vector clust2 = one_read_clusters[b]; + for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { + //And each position in each other cluster, + //make sure that this position is far away from i1 + pos_t pos2 = all_seeds[read_num][clust2[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + + int64_t dist1 = dist_index.minDistance(pos1, pos2); + int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should have been in the same read cluster: " ; + cerr << pos1 << " and " << pos2 << endl; + cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; + REQUIRE(false); + } + } - } } - } - for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { - //For each position in the same cluster - pos_t pos2 = seeds[clust[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = 
MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - new_clusters.union_groups(i1, i2); - } + for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { + //For each position in the same cluster + pos_t pos2 = all_seeds[read_num][clust[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + int64_t dist1 = dist_index.minDistance(pos1, pos2); + int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + new_clusters.union_groups(i1, i2); + } + } } - } - auto actual_clusters = new_clusters.all_groups(); - if (actual_clusters.size() != 1) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should be different read clusters: " << endl; - for (auto c : actual_clusters) { - cerr << "cluster: " ; - for (size_t i1 : c) { - cerr << seeds[clust[i1]] << " "; + auto actual_clusters = new_clusters.all_groups(); + if (actual_clusters.size() != 1) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should be different read clusters: " << endl; + for (auto c : actual_clusters) { + cerr << "cluster: " ; + for (size_t i1 : c) { + cerr << all_seeds[read_num][clust[i1]] << " "; + } + cerr << endl; } - cerr << endl; } + REQUIRE(actual_clusters.size() == 1); } - REQUIRE(actual_clusters.size() == 1); } for (size_t a = 0; a < fragment_clusters.size(); a++) { // For each cluster -cluster this cluster to ensure that @@ -959,7 +982,7 @@ namespace unittest { structures::UnionFind new_clusters (clust.size(), false); for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { - pos_t pos1 = seeds[clust[i1]]; + pos_t pos1 = ordered_seeds[clust[i1]]; size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); pos_t rev1 = make_pos_t(get_id(pos1), !is_rev(pos1), @@ -972,7 +995,7 @@ namespace unittest { for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { //And each position in each other cluster, //make sure that this position is far away from i1 - pos_t pos2 = seeds[clust2[i2]]; + pos_t pos2 = ordered_seeds[clust2[i2]]; size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); pos_t rev2 = make_pos_t(get_id(pos2), !is_rev(pos2), @@ -998,7 +1021,7 @@ namespace unittest { } for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { //For each position in the same cluster - pos_t pos2 = seeds[clust[i2]]; + pos_t pos2 = ordered_seeds[clust[i2]]; size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); pos_t rev2 = make_pos_t(get_id(pos2), !is_rev(pos2), @@ -1023,7 +1046,7 @@ namespace unittest { for (auto c : actual_clusters) { cerr << "cluster: " ; for (size_t i1 : c) { - cerr << seeds[clust[i1]] << " "; + cerr << ordered_seeds[clust[i1]] << " "; } cerr << endl; } From e0108cb8c96d3832579c5d2cf98411a069a287c1 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Thu, 7 Nov 2019 08:43:30 -0800 Subject: [PATCH 28/79] Made debug code compile --- src/seed_clusterer.cpp | 96 ++++++++++++++++++++------------- src/unittest/seed_clusterer.cpp | 40 ++++++-------- 2 files changed, 77 insertions(+), 59 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index ede84c05f36..01884bb2518 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -98,6 +98,7 @@ cerr 
<< endl << "New cluster calculation:" << endl; } cerr << endl; } + cerr << endl; } vector ordered_seeds; for (size_t i = 0 ; i < tree_state.all_seeds->size() ; i++) { @@ -305,11 +306,11 @@ cerr << endl << "New cluster calculation:" << endl; //seeds on this node must be in the same cluster for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { - if (tree_state.node_to_seeds[read_num].size() > 0) { - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), - std::pair(node_id, 0)); + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { size_t group_id = seed_range_start->second; size_t fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; @@ -382,23 +383,25 @@ cerr << endl << "New cluster calculation:" << endl; vector> seed_offsets; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { // for all seeds - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), - std::pair(node_id, 0)); - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { - //For each seed, find its offset - pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; - int64_t offset = is_rev(seed) ? node_length - get_offset(seed) - : get_offset(seed) + 1; - - node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); - node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); - node_clusters.read_best_left[read_num] = min_not_minus_one(offset, node_clusters.read_best_left[read_num]); - node_clusters.read_best_right[read_num] = min_not_minus_one(node_length-offset+1, node_clusters.read_best_right[read_num]); - - seed_offsets.emplace_back(read_num, iter->second, offset); + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + //For each seed, find its offset + pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) + : get_offset(seed) + 1; + + node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); + node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); + node_clusters.read_best_left[read_num] = min_not_minus_one(offset, node_clusters.read_best_left[read_num]); + node_clusters.read_best_right[read_num] = min_not_minus_one(node_length-offset+1, node_clusters.read_best_right[read_num]); + + seed_offsets.emplace_back(read_num, iter->second, offset); + } } } //Sort seeds by their position in the node @@ -469,7 +472,9 @@ cerr << endl << "New cluster calculation:" << endl; } } for (size_t i = 0 ; i < read_last_cluster.size() ; i++) { - node_clusters.read_cluster_heads.emplace(i, read_last_cluster[i]); + if (read_last_cluster[i] != -1) { + node_clusters.read_cluster_heads.emplace(i, read_last_cluster[i]); + } } #ifdef DEBUG_CLUSTER @@ -478,18 +483,16 @@ cerr << endl << "New cluster calculation:" << endl; bool got_left = false; bool got_right = false; - for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { - for (pair c : node_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); - assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); - assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); - assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); - if (dists.first == node_clusters.fragment_best_left) {got_left = true;} - if (dists.second == node_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":"< c : node_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[c.first]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[c.first]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert(has_seeds); cerr << endl; } cerr << endl; @@ -823,7 +830,15 @@ cerr << " Combining this cluster from the right" << endl; } else { //Cluster tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); - combined_cluster[read_num] = tree_state.read_union_find[read_num].find_group(cluster_head.second); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); + + if (new_group == cluster_head.second) { + to_erase.emplace_back(read_num,combined_cluster[read_num]); + } else { + to_erase.push_back(cluster_head); + } + + combined_cluster[read_num] = new_group; combined_left[read_num] = min_not_minus_one(combined_left[read_num], snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left); combined_right[read_num] = min_not_minus_one(combined_right[read_num],snarl_dists.second); @@ -969,11 +984,14 @@ cerr << " Combining this cluster from the right" << endl; pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t\t"; + bool has_seeds = false; for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert (has_seeds); cerr << endl; } #endif @@ -1069,12 +1087,15 @@ cerr << " Combining this cluster from the right" << endl; cerr << "best left : " << chain_clusters.fragment_best_left << " best right : " << chain_clusters.fragment_best_right << endl; for (pair c : chain_clusters.read_cluster_heads) { - cerr << "\t"; + cerr << "\tcluster " << c.first << ":" << c.second; + bool has_seeds = false; for (size_t x = 0 ; x < tree_state.all_seeds->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert(has_seeds); cerr << endl; } bool got_left = false; @@ -1456,11 +1477,14 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; + bool has_seeds = false; for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert(has_seeds); cerr << endl; } assert(got_left); diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 25d7a287e37..7afbd2a851b 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -130,7 +130,7 @@ namespace unittest { } vector seeds1; for (id_t n : seed_nodes1) { - seeds.push_back(make_pos_t(n, false, 0)); + seeds1.push_back(make_pos_t(n, false, 0)); } vector> all_seeds; all_seeds.push_back(seeds); @@ -142,41 +142,35 @@ namespace unittest { vector>> read_clusters = std::get<0>(paired_clusters); //Should be [[[0,1,2]],[[3,4,5,6]]] vector> fragment_clusters = std::get<1>(paired_clusters); - vector> cluster_sets; + vector> read_set_1; for (vector v : read_clusters[0]) { hash_set h; for (size_t s : v) { h.insert(s); } - cluster_sets.push_back(h); + read_set_1.push_back(h); } + vector> read_set_2; for (vector v : read_clusters[1]) { hash_set h; for (size_t s : v) { h.insert(s); } - cluster_sets.push_back(h); + read_set_2.push_back(h); } REQUIRE( read_clusters.size() == 2); - REQUIRE( (read_clusters[0].size() == 3 || read_clusters[1].size() == 3)); - REQUIRE( (read_clusters[0].size() == 4 || read_clusters[1].size() == 4)); + REQUIRE( (read_clusters[0][0].size() == 3 || read_clusters[1][0].size() == 3)); + REQUIRE( (read_clusters[0][0].size() == 4 || read_clusters[1][0].size() == 4)); REQUIRE( fragment_clusters.size() == 1); - REQUIRE (((cluster_sets[0].count(0) == 1 && - cluster_sets[0].count(1) == 1 && - cluster_sets[0].count(2) == 1 && - cluster_sets[1].count(3) == 1 && - cluster_sets[1].count(4) == 1 && - cluster_sets[1].count(5) == 1 && - cluster_sets[1].count(6) == 1 ) || - - ( cluster_sets[1].count(0) == 1 && - cluster_sets[1].count(1) == 1 && - cluster_sets[1].count(2) == 1 && - 
cluster_sets[0].count(3) == 1 && - cluster_sets[0].count(4) == 1 && - cluster_sets[0].count(5) == 1 && - cluster_sets[0].count(6) == 1 ))); - + REQUIRE ( read_set_1.size() == 1); + REQUIRE (( read_set_1[0].count(0) == 1 && + read_set_1[0].count(1) == 1 && + read_set_1[0].count(2) == 1)); + REQUIRE (read_set_2.size() == 1); + REQUIRE (( read_set_2[0].count(0) == 1 && + read_set_2[0].count(1) == 1 && + read_set_2[0].count(2) == 1 && + read_set_2[0].count(3) == 1 )); } SECTION( "Two fragment clusters" ) { @@ -190,7 +184,7 @@ namespace unittest { } vector seeds1; for (id_t n : seed_nodes1) { - seeds.push_back(make_pos_t(n, false, 0)); + seeds1.push_back(make_pos_t(n, false, 0)); } vector> all_seeds; all_seeds.push_back(seeds); From e23f732d3495156c9f9e54d71d3ab94d40f773d9 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Thu, 7 Nov 2019 16:02:30 -0800 Subject: [PATCH 29/79] Clusterer passes unit tests --- src/seed_clusterer.cpp | 354 +++++++++++++++++++------------- src/seed_clusterer.hpp | 6 +- src/unittest/seed_clusterer.cpp | 149 +++++++------- 3 files changed, 292 insertions(+), 217 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index 01884bb2518..31fe6e6b7b7 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -26,7 +26,7 @@ namespace vg { * Returns a vector of cluster assignments */ #ifdef DEBUG_CLUSTER -cerr << endl << "New cluster calculation:" << endl; +cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; #endif if (fragment_distance_limit != 0 && fragment_distance_limit < read_distance_limit) { @@ -49,6 +49,7 @@ cerr << endl << "New cluster calculation:" << endl; for (auto& v : all_seeds) seed_count+= v.size(); TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); + //Populate tree_state.node_to_seeds (mapping each node to the seeds it //contains) and snarl_to_nodes_by_level get_nodes(tree_state, snarl_to_nodes_by_level); @@ -74,6 +75,12 @@ cerr << endl << "New cluster calculation:" << endl; move(snarl_to_nodes_by_level[depth - 1]); } +#ifdef DEBUG_CLUSTER +assert(tree_state.read_index_offsets[0] == 0); +for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { + assert (tree_state.read_index_offsets[i] + tree_state.all_seeds->at(i).size() == tree_state.read_index_offsets[i+1]); +} +#endif //Cluster all the snarls at this depth //Also records which snarls are in chains and the parents of these //snarls in tree_state.parent_snarl_to_node @@ -131,7 +138,7 @@ cerr << endl << "New cluster calculation:" << endl; void SnarlSeedClusterer::get_nodes( TreeState& tree_state, vector>>>& - snarl_to_nodes) const { + snarl_to_nodes_by_level) const { // Assign each seed to a node. for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ @@ -145,17 +152,16 @@ cerr << endl << "New cluster calculation:" << endl; } // Assign each node to a snarl. 
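// For reference, a minimal stand-alone sketch (hypothetical names, not part
// of this patch) of the read_index_offsets invariant asserted above: seeds
// are stored per read, but the fragment union-find runs over one flat index
// range, so offsets[i] is the prefix sum of the earlier reads' seed counts
// and read i's seed j gets global index offsets[i] + j.
//
//     vector<size_t> offsets(1, 0);                      // offsets[0] == 0
//     for (auto& read_seeds : all_seeds) {
//         offsets.push_back(offsets.back() + read_seeds.size());
//     }
//     // checked above: offsets[i] + all_seeds[i].size() == offsets[i + 1]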
- id_t prev_node = -1; + hash_set seen_nodes; for (auto& read_node :tree_state.node_to_seeds) { for (auto& mapping : read_node) { - if (mapping.first == prev_node) { - continue; + if (seen_nodes.count(mapping.first) < 1) { + seen_nodes.insert( mapping.first); + size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); + size_t depth = dist_index.snarl_indexes[snarl_i].depth; + snarl_to_nodes_by_level[depth][snarl_i].emplace_back( + NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); } - prev_node = mapping.first; - size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); - size_t depth = dist_index.snarl_indexes[snarl_i].depth; - snarl_to_nodes[depth][snarl_i].emplace_back( - NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); } } } @@ -305,15 +311,16 @@ cerr << endl << "New cluster calculation:" << endl; //If the limit is greater than the node length, then all the //seeds on this node must be in the same cluster + size_t fragment_group_id = -1; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { auto seed_range_start = std::lower_bound( tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end(), std::pair(node_id, 0)); - if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { + if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { size_t group_id = seed_range_start->second; - size_t fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; + if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { //For each seed on this node, add it to the cluster @@ -350,33 +357,48 @@ cerr << endl << "New cluster calculation:" << endl; if (tree_state.fragment_distance_limit != 0) { fragment_group_id = tree_state.fragment_union_find.find_group(fragment_group_id); - tree_state.fragment_cluster_dists[fragment_group_id] = make_pair(node_clusters.fragment_best_left, - node_clusters.fragment_best_right); } + } + } #ifdef DEBUG_CLUSTER - assert (group_id == tree_state.read_union_find[read_num].find_group(group_id)); - cerr << "Found single cluster on node " << node_id << "with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; - bool got_left = false; - bool got_right = false; - for (pair c : node_clusters.read_cluster_heads) { + cerr << "Found single cluster on node " << node_id << " with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; + + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << node_clusters.read_best_left[read_num] << " best right: " << node_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + for (pair c : node_clusters.read_cluster_heads) { + if (c.first == read_num) { pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } assert(dists.first == -1 || dists.first >= 
node_clusters.read_best_left[read_num]); assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); if (dists.first == node_clusters.fragment_best_left) {got_left = true;} if (dists.second == node_clusters.fragment_best_right) {got_right = true;} - //if (dists.first == node_clusters.read_best_left[read_num]) {got_all_left[read_num] = true;} - //if (dists.second == node_clusters.read_best_right[read_num]) {got_all_right[read_num] = true;} - cerr << "\t" << c.first << ":"<(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); fragment_last_cluster = tree_state.fragment_union_find.find_group(std::get<1>(s)+tree_state.read_index_offsets[read_num]); - tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); fragment_last_offset = std::get<2>(s); } } else { @@ -454,19 +474,16 @@ cerr << endl << "New cluster calculation:" << endl; make_pair(read_last_offset[read_num], node_length - read_last_offset[read_num] + 1); if (tree_state.fragment_distance_limit != 0) { if (fragment_last_offset != -1 && - abs(read_last_offset[read_num] - fragment_last_offset) <= tree_state.fragment_distance_limit) { + abs(std::get<2>(s) - fragment_last_offset) <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster - int64_t prev_dist_left = tree_state.fragment_cluster_dists[fragment_last_cluster].first; tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); fragment_last_cluster = tree_state.fragment_union_find.find_group(fragment_last_cluster); - tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); + fragment_last_offset = std::get<2>(s); } else { //If this is a new fragment cluster as well fragment_last_cluster = std::get<1>(s)+tree_state.read_index_offsets[read_num]; fragment_last_offset = std::get<2>(s); - tree_state.fragment_cluster_dists[fragment_last_cluster] = - make_pair(fragment_last_offset, node_length-fragment_last_offset+1); } } } @@ -480,21 +497,40 @@ cerr << endl << "New cluster calculation:" << endl; #ifdef DEBUG_CLUSTER cerr << "Found read clusters on node " << node_id << endl; + bool got_left = false; bool got_right = false; - - for (pair c : node_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[c.first]); - assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[c.first]); - assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); - assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); - if (dists.first == node_clusters.fragment_best_left) {got_left = true;} - if (dists.second == node_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":"<size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << node_clusters.read_best_left[read_num] << " best right: " << node_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + for (pair c : node_clusters.read_cluster_heads) { + if (c.first == read_num) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << 
":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + if (dists.first == node_clusters.read_best_left[read_num]) {got_read_left = true;} + if (dists.second == node_clusters.read_best_right[read_num]) {got_read_right = true;} + cerr << endl; + assert(has_seeds); + } + } + assert(got_read_left || node_clusters.read_best_left[read_num] == -1); + assert(got_read_right || node_clusters.read_best_right[read_num] == -1); } - assert(got_left ); + assert(got_left); assert(got_right); for (pair group_id : node_clusters.read_cluster_heads) { assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); @@ -531,7 +567,7 @@ cerr << endl << "New cluster calculation:" << endl; //Used when two clusters in the same snarl can be combined by //looping in the chain - if (read_dist <= tree_state.read_distance_limit) { + if (read_dist != -1 && read_dist <= tree_state.read_distance_limit) { if (combined_group == -1) { combined_group = new_group; } else { @@ -576,12 +612,12 @@ cerr << endl << "New cluster calculation:" << endl; fragment_dist <= tree_state.fragment_distance_limit) { //If these aren't in the same read cluster but are in //the same fragment cluster - if (fragment_combined_group == -1) { - fragment_combined_group = new_group; - } else { - tree_state.fragment_union_find.union_groups(new_group, fragment_combined_group); - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + if (fragment_combined_group != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_group, + new_group + tree_state.read_index_offsets[read_num]); } + fragment_combined_group = tree_state.fragment_union_find.find_group( + new_group + tree_state.read_index_offsets[read_num]); } cerr << endl; return; @@ -765,17 +801,18 @@ cerr << " (Possibly) updating looping distance to right of snarl cluster " << r #endif - if (snarl_clusters.read_best_left[read_num] != -1 && snarl_dists.first != -1 ) { + if (snarl_clusters.fragment_best_left!= -1 && snarl_dists.first != -1 ) { //If this cluster can be combined with another cluster //from the left #ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the left " ; #endif + int64_t read_dist = snarl_clusters.read_best_left[read_num] == -1 ? 
-1 : + snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1; combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], fragment_snarl_cluster_left, to_erase, snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1, - snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1, - snarl_dists, read_num); + read_dist, snarl_dists, read_num); } } @@ -798,18 +835,19 @@ cerr << "Updating looping distance to left of snarl cluster " << read_num << ":" #endif } - if (snarl_clusters.read_best_right[read_num] != -1 && snarl_dists.second != -1 ) { + if (snarl_clusters.fragment_best_right != -1 && snarl_dists.second != -1 ) { //If this cluster can be combined with another cluster //from the right #ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the right" << endl; #endif + int64_t read_dist = snarl_clusters.read_best_right[read_num] == -1 ? -1 : + snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1; combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], fragment_snarl_cluster_right, to_erase, snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1, - snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1, - snarl_dists, read_num); + read_dist, snarl_dists, read_num); } } @@ -855,7 +893,7 @@ cerr << " Combining this cluster from the right" << endl; //If the snarl cluster does not get combined with any of //the existing chain clusters, then it becomes a new chain cluster if (tree_state.fragment_distance_limit != 0 && fragment_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.read_distance_limit) { + snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.fragment_distance_limit) { //Cluster in the same fragment but not the same read if (fragment_combined_cluster != -1) { //Also cluster by fragment @@ -1045,7 +1083,10 @@ cerr << " Combining this cluster from the right" << endl; } else { tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); if (tree_state.fragment_distance_limit != 0) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second + tree_state.all_seeds->size()); + if (fragment_combined_cluster != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); + } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); if (new_group == cluster_head.second) { @@ -1087,35 +1128,47 @@ cerr << " Combining this cluster from the right" << endl; cerr << "best left : " << chain_clusters.fragment_best_left << " best right : " << chain_clusters.fragment_best_right << endl; for (pair c : chain_clusters.read_cluster_heads) { - cerr << "\tcluster " << c.first << ":" << c.second; - bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->size() ; x++) { - if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; - has_seeds = true; - } - } - assert(has_seeds); - cerr << endl; } bool got_left = false; bool got_right = false; - for (pair c : 
chain_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - if (!chain_index.is_looping_chain){ - assert(dists.first == -1 || dists.first >= chain_clusters.fragment_best_left); - assert(dists.second == -1 || dists.second >= chain_clusters.fragment_best_right); - assert(dists.first == -1 || dists.first >= chain_clusters.read_best_left[c.first]); - assert(dists.second == -1 || dists.second >= chain_clusters.read_best_right[c.first]); + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << chain_clusters.read_best_left[read_num] << " best right: " << chain_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : chain_clusters.read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } + assert(dists.first == -1 || dists.first >= chain_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= chain_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= chain_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= chain_clusters.fragment_best_right); + if (dists.first == chain_clusters.fragment_best_left) {got_left = true;} + if (dists.second == chain_clusters.fragment_best_right) {got_right = true;} + if (dists.first == chain_clusters.read_best_left[read_num]) {got_read_left = true;} + if (dists.second == chain_clusters.read_best_right[read_num]) {got_read_right = true;} + cerr << endl; + assert(has_seeds); + } + } + if (!chain_index.is_looping_chain) { + assert(!any_clusters || got_read_left || chain_clusters.read_best_left[read_num] > tree_state.read_distance_limit || chain_clusters.read_best_left[read_num] == -1); + assert(!any_clusters || got_read_right || chain_clusters.read_best_right[read_num] > tree_state.read_distance_limit || chain_clusters.read_best_right[read_num] == -1); } - if (dists.first == chain_clusters.fragment_best_left) {got_left = true;} - if (dists.second == chain_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " - << dists.second << endl; } + if (!chain_index.is_looping_chain) { - assert(got_left); - assert(got_right); + assert(got_left || chain_clusters.fragment_best_left > tree_state.fragment_distance_limit); + assert(got_right ||chain_clusters.fragment_best_right > tree_state.fragment_distance_limit ); } for (pair group_id : chain_clusters.read_cluster_heads) { @@ -1143,14 +1196,13 @@ cerr << " Combining this cluster from the right" << endl; NodeClusters snarl_clusters(tree_state.all_seeds->size()); auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, - size_t& fragment_combined_group, int64_t read_dist, - int64_t fragment_dist, - pair& end_dists, size_t read_num){ + size_t& fragment_combined_group, + int64_t fragment_dist, int64_t read_dist, size_t read_num){ //Helper function to compare and combine clusters in two nodes of the same snarl //If the distance between two clusters is small enough, then combine them //for the read clusters and, if applicable, for the fragment clusters 
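//A simplified sketch of the rule these combine helpers apply (illustration
//only, not part of this patch; assumes a generic union-find with
//union_groups, where off is the read's offset into the flat fragment range):
//
//    if (read_dist != -1 && read_dist <= read_distance_limit) {
//        read_uf.union_groups(a, b);                     // same read cluster
//        if (fragment_distance_limit != 0) {
//            fragment_uf.union_groups(a + off, b + off); // same fragment too
//        }
//    } else if (fragment_distance_limit != 0 && fragment_dist != -1
//               && fragment_dist <= fragment_distance_limit) {
//        fragment_uf.union_groups(a + off, b + off);     // fragment only
//    }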
//Updates the distances stored for the read clusters - if (read_dist <= tree_state.read_distance_limit) { + if (read_dist != -1 && read_dist <= tree_state.read_distance_limit) { //If the clusters are close enough to combine in the read if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_group != -1) { @@ -1160,6 +1212,8 @@ cerr << " Combining this cluster from the right" << endl; } fragment_combined_group = tree_state.fragment_union_find.find_group(new_group+tree_state.read_index_offsets[read_num]); } + pair& end_dists = tree_state.read_cluster_dists[read_num][new_group]; + if (combined_group == -1) { snarl_clusters.read_cluster_heads.emplace(read_num,new_group); tree_state.read_cluster_dists[read_num][new_group] = end_dists; @@ -1192,13 +1246,12 @@ cerr << " Combining this cluster from the right" << endl; && fragment_dist <= tree_state.fragment_distance_limit) { //Same fragment - if (fragment_combined_group == -1) { - fragment_combined_group = new_group; - } else { - tree_state.fragment_union_find.union_groups( - new_group + tree_state.read_index_offsets[read_num], fragment_combined_group); - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + if (fragment_combined_group != -1) { + //Also combine fragment clusters + tree_state.fragment_union_find.union_groups(new_group+tree_state.read_index_offsets[read_num], + fragment_combined_group); } + fragment_combined_group = tree_state.fragment_union_find.find_group(new_group+tree_state.read_index_offsets[read_num]); } return; }; @@ -1250,7 +1303,7 @@ cerr << " Combining this cluster from the right" << endl; cerr << "Node rank is " << node_rank << " fwd, " << rev_rank << " rev of " << snarl_index.num_nodes * 2 << endl; cerr << "Clusters at this child:" << endl; - for (pair c : child_nodes[i].second.read_cluster_heads) { + for (pair c : curr_child_clusters.read_cluster_heads) { cerr << "\tdist left: " << tree_state.read_cluster_dists[c.first][c.second].first << " dist right: " << tree_state.read_cluster_dists[c.first][c.second].second << endl; cerr << "\t\t"; @@ -1369,8 +1422,6 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank pair child_cluster_head = children_i[c_i]; size_t read_num = child_cluster_head.first; size_t c_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - - pair new_dists = tree_state.read_cluster_dists[read_num][c_group]; pair dists_c = old_dists[child_cluster_head]; @@ -1378,33 +1429,37 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank && other_node_clusters.fragment_best_left != -1 ) { //If cluster child_cluster_head can be combined with clusters in j //from the left of both of them + int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : + dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_l_l[read_num], fragment_group_l_l, dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1, - dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1, - new_dists, read_num); + read_dist, read_num); } if (dist_l_r != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j + int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? 
-1 : + dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1, - dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1, - new_dists, read_num); + read_dist, read_num); } if (dist_r_l != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_left != -1 ) { + int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : + dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1, - dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1, - new_dists, read_num); + read_dist, read_num); } if (dist_r_r != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_right != -1 ) { + int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? -1 : + dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_r_r[read_num], fragment_group_r_r, dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1, - dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1, - new_dists, read_num); + read_dist, read_num); } } @@ -1422,42 +1477,46 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank //snarl_cluster heads pair child_cluster_head = children_j[k_i]; size_t read_num = child_cluster_head.first; - pair& dist_bounds_k = old_dists[child_cluster_head]; + pair& dists_k = old_dists[child_cluster_head]; size_t k_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - pair dists_k = tree_state.read_cluster_dists[read_num][k_group]; - if (dist_l_l != -1 && curr_child_clusters.read_best_left[read_num] != -1 - && dist_bounds_k.first != -1 ){ + if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 + && dists_k.first != -1 ){ + int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : + dist_l_l + curr_child_clusters.read_best_left[read_num] + dists_k.first-1; combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, - dist_l_l + curr_child_clusters.fragment_best_left + dist_bounds_k.first-1, - dist_l_l + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.first-1, - dists_k, read_num); + dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1, + read_dist, read_num); } - if (dist_l_r != -1 && curr_child_clusters.read_best_left[read_num] != -1 - && dist_bounds_k.second != -1 ) { + if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 + && dists_k.second != -1 ) { + + int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? 
-1 : + dist_l_r + curr_child_clusters.read_best_left[read_num] + dists_k.second-1; combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, - dist_l_r + curr_child_clusters.fragment_best_left + dist_bounds_k.second-1, - dist_l_r + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.second-1, - dists_k, read_num); + dist_l_r + curr_child_clusters.fragment_best_left + dists_k.second-1, + read_dist, read_num); } - if (dist_r_l != -1 && curr_child_clusters.read_best_right[read_num] != -1 - && dist_bounds_k.first != -1 ) { + if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 + && dists_k.first != -1 ) { + int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : + dist_r_l + curr_child_clusters.read_best_right[read_num] + dists_k.first-1; combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, - dist_r_l + curr_child_clusters.fragment_best_right + dist_bounds_k.first-1, - dist_r_l + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.first-1, - dists_k,read_num); + dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1, + read_dist, read_num); } - if (dist_r_r != -1 && curr_child_clusters.read_best_right[read_num] != -1 - && dist_bounds_k.second != -1 ) { + if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 + && dists_k.second != -1 ) { + int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : + dist_r_r + curr_child_clusters.read_best_right[read_num] + dists_k.second-1; combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, - dist_r_r + curr_child_clusters.fragment_best_right + dist_bounds_k.second-1, - dist_r_r + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.second-1, - dists_k, read_num); + dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1, + read_dist, read_num); } } } @@ -1470,22 +1529,37 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank << snarl_clusters.fragment_best_right << endl; bool got_left = false; bool got_right = false; - for (pair c : snarl_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - if (dists.first == snarl_clusters.fragment_best_left) {got_left = true;} - if (dists.second == snarl_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " - << dists.second << endl; - cerr << "\t\t"; - bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { - if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; - has_seeds = true; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << snarl_clusters.read_best_left[read_num] << " best right: " << snarl_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : snarl_clusters.read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } + assert(dists.first == -1 || dists.first >= snarl_clusters.read_best_left[read_num]); + 
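// (The paired asserts here pin down the summary invariant: every cluster
// head's left/right distance is bounded below by both the per-read and the
// per-fragment best, so the best_* fields are true minima over their heads.)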
assert(dists.second == -1 || dists.second >= snarl_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= snarl_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= snarl_clusters.fragment_best_right); + if (dists.first == snarl_clusters.fragment_best_left) {got_left = true;} + if (dists.second == snarl_clusters.fragment_best_right) {got_right = true;} + if (dists.first == snarl_clusters.read_best_left[read_num]) {got_read_left = true;} + if (dists.second == snarl_clusters.read_best_right[read_num]) {got_read_right = true;} + cerr << endl; + assert(has_seeds); } } - assert(has_seeds); - cerr << endl; + assert(!any_clusters ||got_read_left || snarl_clusters.read_best_left[read_num] == -1); + assert(!any_clusters ||got_read_right || snarl_clusters.read_best_right[read_num] == -1); } assert(got_left); assert(got_right); diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index d90410798ae..f95f0261ae9 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -157,7 +157,6 @@ class SnarlSeedClusterer { //These values are only relevant for seeds that represent a cluster //in union_find_reads vector>> read_cluster_dists; - vector> fragment_cluster_dists; @@ -196,12 +195,11 @@ class SnarlSeedClusterer { TreeState (vector>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), - fragment_cluster_dists(all_seeds->size(), make_pair(-1, -1)), read_distance_limit(read_distance_limit), fragment_distance_limit(fragment_distance_limit), - fragment_union_find (seed_count, false) { + fragment_union_find (seed_count, false), + read_index_offsets(1,0){ - read_index_offsets.push_back(0); size_t total_seeds = 0; for (vector& v : *all_seeds) { total_seeds += v.size(); diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 7afbd2a851b..8e107e8e99d 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -472,7 +472,7 @@ namespace unittest { vector>fragment_clusters = std::get<1>(paired_clusters); REQUIRE( read_clusters.size() == 1); - REQUIRE( read_clusters[0].size() == 1); + REQUIRE( read_clusters[0].size() == 4); REQUIRE( fragment_clusters.size() == 4); //New fragment clusters @@ -825,7 +825,7 @@ namespace unittest { TEST_CASE("Random graphs", "[cluster]"){ - for (int i = 0; i < 0; i++) { + for (int i = 0; i < 1000; i++) { // For each random graph VG graph; random_graph(1000, 20, 100, &graph); @@ -886,86 +886,89 @@ namespace unittest { ordered_seeds.push_back(s); } - for (size_t read_num = 0 ; read_num <= 2 ; read_num ++) { + for (size_t read_num = 0 ; read_num < 2 ; read_num ++) { auto& one_read_clusters = read_clusters[read_num]; - for (size_t a = 0; a < one_read_clusters.size(); a++) { - // For each cluster -cluster this cluster to ensure that - // there is only one - vector clust = one_read_clusters[a]; - - structures::UnionFind new_clusters (clust.size(), false); - - for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { - pos_t pos1 = all_seeds[read_num][clust[i1]]; - size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); - pos_t rev1 = make_pos_t(get_id(pos1), - !is_rev(pos1), - len1 - get_offset(pos1)-1); - - for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { - if (b != a) { - //For each other cluster - vector clust2 = one_read_clusters[b]; - for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { - //And each position in each other cluster, - //make sure that this position is far away from i1 - pos_t 
pos2 = all_seeds[read_num][clust2[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should have been in the same read cluster: " ; - cerr << pos1 << " and " << pos2 << endl; - cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; - REQUIRE(false); + if (one_read_clusters.size() > 0) { + for (size_t a = 0; a < one_read_clusters.size(); a++) { + // For each cluster -cluster this cluster to ensure that + // there is only one + cerr << a << " of " << one_read_clusters.size() << endl; + vector clust = one_read_clusters[a]; + + structures::UnionFind new_clusters (clust.size(), false); + + for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { + pos_t pos1 = all_seeds[read_num][clust[i1]]; + size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); + pos_t rev1 = make_pos_t(get_id(pos1), + !is_rev(pos1), + len1 - get_offset(pos1)-1); + + for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { + if (b != a) { + //For each other cluster + vector clust2 = one_read_clusters[b]; + for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { + //And each position in each other cluster, + //make sure that this position is far away from i1 + pos_t pos2 = all_seeds[read_num][clust2[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + + int64_t dist1 = dist_index.minDistance(pos1, pos2); + int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should have been in the same read cluster: " ; + cerr << pos1 << " and " << pos2 << endl; + cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; + REQUIRE(false); + } + } - } } - } - for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { - //For each position in the same cluster - pos_t pos2 = all_seeds[read_num][clust[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - new_clusters.union_groups(i1, i2); - } + for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { + //For each position in the same cluster + pos_t pos2 = all_seeds[read_num][clust[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + int64_t dist1 = dist_index.minDistance(pos1, pos2); 
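// (Note: minDistance is directed, so dist1..dist4 below try both
// orientations of both positions; taking minPos over the four combinations
// gives an orientation-free minimum distance to compare against read_lim.)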
+ int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + new_clusters.union_groups(i1, i2); + } + } } - } - auto actual_clusters = new_clusters.all_groups(); - if (actual_clusters.size() != 1) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should be different read clusters: " << endl; - for (auto c : actual_clusters) { - cerr << "cluster: " ; - for (size_t i1 : c) { - cerr << all_seeds[read_num][clust[i1]] << " "; + auto actual_clusters = new_clusters.all_groups(); + if (actual_clusters.size() != 1) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should be different read clusters: " << endl; + for (auto c : actual_clusters) { + cerr << "cluster: " ; + for (size_t i1 : c) { + cerr << all_seeds[read_num][clust[i1]] << " "; + } + cerr << endl; } - cerr << endl; } + REQUIRE(actual_clusters.size() == 1); } - REQUIRE(actual_clusters.size() == 1); } } for (size_t a = 0; a < fragment_clusters.size(); a++) { From 645941cfea508fef37a372c165a989b148bd35f1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 12:43:06 -0500 Subject: [PATCH 30/79] add probablistic support caller --- src/algorithms/coverage_depth.cpp | 29 ++ src/algorithms/coverage_depth.hpp | 17 +- src/graph_caller.cpp | 69 ++++- src/graph_caller.hpp | 17 +- src/snarl_caller.cpp | 428 ++++++++++++++++++++++++++---- src/snarl_caller.hpp | 155 ++++++++--- src/subcommand/call_main.cpp | 37 ++- src/traversal_support.cpp | 22 +- src/traversal_support.hpp | 7 +- 9 files changed, 667 insertions(+), 114 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index b68b8c50b55..0d63f656cb9 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -130,6 +130,35 @@ vector> binned_packed_depth(const Packer& return binned_depths; } +unordered_map>> binned_packed_depth_index(const Packer& packer, + const vector& path_names, + size_t bin_size, + size_t min_coverage, + bool include_deletions, + bool std_err) { + unordered_map>> depth_index; + for (const string& path_name : path_names) { + vector> binned_depths = binned_packed_depth(packer, path_name, bin_size, + min_coverage, include_deletions); + // todo: probably more efficent to just leave in sorted vector + map>& depth_map = depth_index[path_name]; + for (auto& binned_depth : binned_depths) { + double var = get<3>(binned_depth); + // optionally convert variance to standard error + if (std_err) { + var = sqrt(var / (double)(get<1>(binned_depth) - get<2>(binned_depth))); + } + depth_map[get<0>(binned_depth)] = make_pair(get<2>(binned_depth), var); + } + } + + return depth_index; +} + +const pair& get_depth_from_index(const unordered_map>>& depth_index, + const string& path_name, size_t offset) { + return depth_index.at(path_name).lower_bound(offset)->second; +} // draw (roughly) max_nodes nodes from the graph using the random seed static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 730cd316f5a..9084949df36 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -11,7 +11,6 @@ #include "handle.hpp" #include "packer.hpp" - namespace vg { namespace 
algorithms { @@ -35,7 +34,21 @@ pair packed_depth_of_bin(const Packer& packer, step_handle_t sta /// Use all available threads to estimate the binned packed coverage of a path using above function /// Each element is a bin's 0-based open-ended interval in the path, and its coverage mean,variance. vector> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size, - size_t min_coverage, bool include_deletions); + size_t min_coverage, bool include_deletions); + +/// Use the above function to retrieve the binned depths of a list of paths, and store them indexed by start +/// coordinate. If std_err is true, store the standard error instead of the variance +using BinnedDepthIndex = unordered_map>>; +BinnedDepthIndex binned_packed_depth_index(const Packer& packer, + const vector& path_names, + size_t bin_size, + size_t min_coverage, + bool include_deletions, + bool std_err); + +/// Query index created above +/// Todo: optionally smooth over adjacent bins? +const pair& get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t offset); /// Return the mean and variance of coverage of randomly sampled nodes from a GAM /// Nodes with less than min_coverage are ignored diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 4387ef0b63d..55e8d097759 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -143,8 +143,12 @@ bool VCFGenotyper::call_snarl(const Snarl& snarl) { } } + // find a path range corresponding to our snarl by way of the VCF variants. + tuple ref_positions = get_ref_positions(variants); + // use our support caller to choose our genotype (in traversal coordinates) - vector trav_genotype = snarl_caller.genotype(snarl, travs, ref_trav_idx, 2); + vector trav_genotype = snarl_caller.genotype(snarl, travs, ref_trav_idx, 2, get<0>(ref_positions), + make_pair(get<1>(ref_positions), get<2>(ref_positions))); assert(trav_genotype.empty() || trav_genotype.size() == 2); @@ -231,6 +235,35 @@ string VCFGenotyper::vcf_header(const PathHandleGraph& graph, const vector VCFGenotyper::get_ref_positions(const vector& variants) const { + // if there is more than one path in our snarl (unlikely for most graphs we'll vcf-genotype) + // then we return the one with the biggest interval + map> path_offsets; + for (const vcflib::Variant* var : variants) { + if (path_offsets.count(var->sequenceName)) { + pair& record = path_offsets[var->ref]; + record.first = std::min((size_t)var->position, record.first); + record.second = std::max((size_t)var->position + var->ref.length(), record.second); + } else { + path_offsets[var->sequenceName] = make_pair(var->position, var->position + var->ref.length()); + } + } + + string ref_path; + size_t ref_range_size = 0; + pair ref_range; + for (auto& path_offset : path_offsets) { + size_t len = path_offset.second.second - path_offset.second.first; + if (len > ref_range_size) { + ref_range_size = len; + ref_path = path_offset.first; + ref_range = path_offset.second; + } + } + + return make_tuple(ref_path, ref_range.first, ref_range.second); +} + unordered_map VCFGenotyper::scan_contig_lengths() const { unordered_map ref_lengths; @@ -387,7 +420,8 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { string path_name = find_index(snarl, is_vg ?
path_indexes : site_path_indexes).first; // orient the snarl along the reference path - if (get_ref_position(snarl, path_name).second == true) { + tuple ref_interval = get_ref_interval(snarl, path_name); + if (get<2>(ref_interval) == true) { snarl_manager.flip(&snarl); } @@ -396,12 +430,14 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { // these integers map the called traversals to their positions in the list of all traversals // of the top level snarl. vector genotype; - std::tie(called_traversals, genotype) = top_down_genotype(snarl, *rep_trav_finder, 2); + std::tie(called_traversals, genotype) = top_down_genotype(snarl, *rep_trav_finder, 2, + path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); if (!called_traversals.empty()) { // regenotype our top-level traversals now that we know they aren't nested, and we have a // good idea of all the sizes - std::tie(called_traversals, genotype) = re_genotype(snarl, *rep_trav_finder, called_traversals, genotype, 2); + std::tie(called_traversals, genotype) = re_genotype(snarl, *rep_trav_finder, called_traversals, genotype, 2, + path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); // emit our vcf variant emit_variant(snarl, *rep_trav_finder, called_traversals, genotype, path_name); @@ -431,13 +467,14 @@ string LegacyCaller::vcf_header(const PathHandleGraph& graph, const vector, vector> LegacyCaller::top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy) const { +pair, vector> LegacyCaller::top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy, + const string& ref_path_name, pair ref_interval) const { // get the traversals through the site vector traversals = trav_finder.find_traversals(snarl); // use our support caller to choose our genotype - vector trav_genotype = snarl_caller.genotype(snarl, traversals, 0, ploidy); + vector trav_genotype = snarl_caller.genotype(snarl, traversals, 0, ploidy, ref_path_name, ref_interval); if (trav_genotype.empty()) { return make_pair(vector(), vector()); } @@ -470,7 +507,7 @@ pair, vector> LegacyCaller::top_down_genotype(const snarl_manager.flip(into_snarl); } vector child_genotype = top_down_genotype(*into_snarl, - trav_finder, hom ? 2: 1).first; + trav_finder, hom ? 
2: 1, ref_path_name, ref_interval).first; if (child_genotype.empty()) { return make_pair(vector(), vector()); } @@ -533,9 +570,11 @@ SnarlTraversal LegacyCaller::get_reference_traversal(const Snarl& snarl, Travers pair, vector> LegacyCaller::re_genotype(const Snarl& snarl, TraversalFinder& trav_finder, const vector& in_traversals, const vector& in_genotype, - int ploidy) const { + int ploidy, + const string& ref_path_name, + pair ref_interval) const { assert(in_traversals.size() == in_genotype.size()); - + // create a set of unique traversal candidates that must include the reference first vector rg_traversals; // add our reference traversal to the front @@ -556,7 +595,7 @@ pair, vector> LegacyCaller::re_genotype(const Snarl& } // re-genotype the candidates - vector rg_genotype = snarl_caller.genotype(snarl, rg_traversals, 0, ploidy); + vector rg_genotype = snarl_caller.genotype(snarl, rg_traversals, 0, ploidy, ref_path_name, ref_interval); // convert our output to something that emit_variant() will understand // todo: this is needlessly inefficient and should be streamlined to operate @@ -634,7 +673,7 @@ void LegacyCaller::emit_variant(const Snarl& snarl, TraversalFinder& trav_finder // fill out the rest of the variant out_variant.sequenceName = ref_path_name; // +1 to convert to 1-based VCF - out_variant.position = get_ref_position(snarl, ref_path_name).first + ref_offsets.find(ref_path_name)->second + 1; + out_variant.position = get<0>(get_ref_interval(snarl, ref_path_name)) + ref_offsets.find(ref_path_name)->second + 1; out_variant.id = std::to_string(snarl.start().node_id()) + "_" + std::to_string(snarl.end().node_id()); out_variant.filter = "PASS"; out_variant.updateAlleleIndexes(); @@ -691,7 +730,7 @@ pair LegacyCaller::find_index(const Snarl& snarl, const vect return make_pair("", nullptr); } -pair LegacyCaller::get_ref_position(const Snarl& snarl, const string& ref_path_name) const { +tuple LegacyCaller::get_ref_interval(const Snarl& snarl, const string& ref_path_name) const { path_handle_t path_handle = graph.get_path_handle(ref_path_name); handle_t start_handle = graph.get_handle(snarl.start().node_id(), snarl.start().backward()); @@ -743,7 +782,11 @@ pair LegacyCaller::get_ref_position(const Snarl& snarl, const stri size_t end_position = end_step == end_steps.begin()->second ? end_steps.begin()->first : graph.get_position_of_step(end_step); bool backward = end_position < start_position; - return make_pair(backward ? end_position : start_position, backward); + if (backward) { + return make_tuple(end_position, start_position, backward); + } else { + return make_tuple(start_position, end_position, backward); + } } void LegacyCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool backward) const { diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index e4f1367f2e7..2f4d8c38bbb 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -102,6 +102,9 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller { protected: + /// get path positions bounding a set of variants + tuple get_ref_positions(const vector& variants) const; + /// munge out the contig lengths from the VCF header virtual unordered_map scan_contig_lengths() const; @@ -145,7 +148,8 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { /// recursively genotype a snarl /// todo: can this be pushed to a more generic class? 
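// The get_ref_interval() change above normalizes the snarl's path interval so the
// smaller coordinate always comes first, with a bool recording that the snarl runs
// backward along the path. A minimal sketch of that convention with plain std types
// (normalize_interval is a hypothetical name, not the vg API):
#include <algorithm>
#include <cstddef>
#include <tuple>

static std::tuple<size_t, size_t, bool> normalize_interval(size_t start_pos, size_t end_pos) {
    bool backward = end_pos < start_pos;  // snarl end found before its start on the path
    return std::make_tuple(std::min(start_pos, end_pos), std::max(start_pos, end_pos), backward);
}
// Callers can then treat the first two components as a forward-coordinate range without
// re-checking orientation, which is how ref_interval is consumed in the calls above.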
- pair, vector> top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy) const; + pair, vector> top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy, + const string& ref_path_name, pair ref_interval) const; /// we need the reference traversal for VCF, but if the ref is not called, the above method won't find it. SnarlTraversal get_reference_traversal(const Snarl& snarl, TraversalFinder& trav_finder) const; @@ -153,10 +157,13 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { /// re-genotype output of top_down_genotype. it may give slightly different results as /// it's working with fully-defined traversals and can exactly determine lengths and supports /// it will also make sure the reference traversal is in the beginning of the output - pair, vector> re_genotype(const Snarl& snarl, TraversalFinder& trav_finder, + pair, vector> re_genotype(const Snarl& snarl, + TraversalFinder& trav_finder, const vector& in_traversals, const vector& in_genotype, - int ploidy) const; + int ploidy, + const string& ref_path_name, + pair ref_interval) const; /// print a vcf variant void emit_variant(const Snarl& snarl, TraversalFinder& trav_finder, const vector& called_traversals, @@ -168,9 +175,9 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { /// look up a path index for a site and return its name too pair find_index(const Snarl& snarl, const vector path_indexes) const; - /// get the position of a snarl from our reference path using the PathPositionHandleGraph interface + /// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface /// the bool is true if the snarl's backward on the path - pair get_ref_position(const Snarl& snarl, const string& ref_path_name) const; + tuple get_ref_interval(const Snarl& snarl, const string& ref_path_name) const; /// clean up the alleles to not share common prefixes / suffixes void flatten_common_allele_ends(vcflib::Variant& variant, bool backward) const; diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index e13c66fb6e9..77393dce867 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -1,7 +1,7 @@ #include "snarl_caller.hpp" #include "genotypekit.hpp" -//#define debug +#define debug namespace vg { @@ -18,23 +18,28 @@ SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, S graph(graph), snarl_manager(snarl_manager), support_finder(support_finder) { + } SupportBasedSnarlCaller::~SupportBasedSnarlCaller() { } -void SupportBasedSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { - // want to move away from ugly hacks that treat the reference traversal differently, - // so keep all these set the same - if (het_bias >= 0) { - max_het_bias = het_bias; - max_ref_het_bias = het_bias; - max_indel_het_bias = het_bias; - } - if (ref_het_bias >= 0) { - max_ref_het_bias = ref_het_bias; - } +void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant) { + + +} + +const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { + return support_finder; +} + +int SupportBasedSnarlCaller::get_min_total_support_for_call() const { + return min_total_support_for_call; } void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support) { @@ -49,10 +54,45 @@ void SupportBasedSnarlCaller::set_min_supports(double 
min_mad_for_call, double m } } -vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, +int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { + int best_allele = -1; + for(size_t i = 0; i < supports.size(); i++) { + if(std::find(skips.begin(), skips.end(), i) == skips.end() && ( + best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) { + best_allele = i; + } + } + return best_allele; +} + +RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder) : + SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { +} + +RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { + +} + +void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + +vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, - int ploidy) { + int ploidy, + const string& ref_path_name, + pair ref_range) { #ifdef debug cerr << "Support calling site " << pb2json(snarl) << endl; @@ -62,7 +102,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); int best_allele = get_best_support(supports, {}); #ifdef debug @@ -77,7 +117,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); vector skips = {best_allele}; for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); @@ -90,7 +130,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, } } // get the supports of each traversal in light of best - vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); int second_best_allele = get_best_support(secondary_supports, {skips}); // get the supports of each traversal in light of second best @@ -99,7 +139,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, int third_best_allele = -1; if (second_best_allele != -1) { // prune out traversals whose exclusive support relative to second best doesn't pass cut - vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, true, false, ref_trav_idx); + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, 
{second_best_allele}, true, false, false, ref_trav_idx); skips.push_back(best_allele); skips.push_back(second_best_allele); for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { @@ -108,7 +148,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, skips.push_back(i); } } - tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, ref_trav_idx); + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, false, ref_trav_idx); third_best_allele = get_best_support(tertiary_supports, skips); } @@ -151,7 +191,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, // Single ploidy case when doing recursive genotyping. Just return the best allele if (ploidy == 1) { - return {best_allele}; + return vector(1, best_allele); } // Call 1/2 : REF-Alt1/Alt2 even if Alt2 has only third best support else if (ploidy >= 2 && @@ -238,12 +278,14 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, } + // Todo: specify call_info to use new interface, then fix up update_vcf_info to read it, + // and move common logic up to SupportBasedCaller if possible. return genotype; } -void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, +void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, const vector& traversals, - const vector& genotype, + const vector& genotype, const string& sample_name, vcflib::Variant& variant) { @@ -255,11 +297,11 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, shared_travs.push_back(genotype[0]); } // compute the support of our called alleles - vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, 0); + // todo: I think this undercounts support. 
should be fixed (as in Poisson version) + vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, false, 0); // get the support of our uncalled alleles, making sure to not include any called support - // TODO: handle shared support within this set - vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, 0); + vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, 0); // Set up the depth format field variant.format.push_back("DP"); @@ -353,7 +395,7 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, } } -void SupportBasedSnarlCaller::update_vcf_header(string& header) const { +void RatioSupportSnarlCaller::update_vcf_header(string& header) const { header += "##INFO=\n"; header += "##FORMAT=\n"; header += "##FORMAT=\n"; @@ -368,18 +410,7 @@ void SupportBasedSnarlCaller::update_vcf_header(string& header) const { std::to_string(min_site_depth) + "\">\n"; } -int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { - int best_allele = -1; - for(size_t i = 0; i < supports.size(); i++) { - if(std::find(skips.begin(), skips.end(), i) == skips.end() && ( - best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) { - best_allele = i; - } - } - return best_allele; -} - -function SupportBasedSnarlCaller::get_skip_allele_fn() const { +function RatioSupportSnarlCaller::get_skip_allele_fn() const { // port over cutoff used in old support caller (there avg support was used all the time, here // we use the same toggles as when genotyping) return [&](const SnarlTraversal& trav) -> bool { @@ -387,17 +418,7 @@ function SupportBasedSnarlCaller::get_skip_allele_f }; } - -int SupportBasedSnarlCaller::get_min_total_support_for_call() const { - return min_total_support_for_call; -} - -const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { - return support_finder; -} - - -double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, +double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, int second_best_trav, int ref_trav_idx) const { bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); @@ -430,6 +451,315 @@ double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int } +PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder, + const algorithms::BinnedDepthIndex& depth_index) : + SupportBasedSnarlCaller(graph, snarl_manager, support_finder), + depth_index(depth_index) { + +} + +PoissonSupportSnarlCaller::~PoissonSupportSnarlCaller() { + +} + +vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range) { + + +#ifdef debug + cerr << "Poisson Support calling site " << pb2json(snarl) + << " on path " << ref_path_name << ":" << ref_range.first << "-" << ref_range.second << endl; +#endif + + assert(ploidy == 2 || ploidy == 1); + + // get the traversal sizes + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + + // get the supports of each traversal independently + vector supports =
support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); + + // sort the traversals by support + vector ranked_traversals = rank_by_support(supports); + size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size()); + + // the candidate genotypes and their supports. the numbers here are alleles as indexed in traversals[] + map, vector> candidates; + + // consider each of the top 25 traversals as our top_traversal + for (int i = 0; i < max_trav; ++i) { + + int best_allele = ranked_traversals[i]; + + if (ploidy == 1) { + candidates[{best_allele}] = {supports[best_allele]}; + } else { + assert(ploidy == 2); + + // we prune out traversals whose exclusive support (structure that is not shared with best traversal) + // doesn't meet a certain cutoff + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); + set skips = {best_allele}; + for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { + if (j != best_allele && support_val(secondary_exclusive_supports[j]) <= min_total_support_for_call) { + skips.insert(j); + } + } + + // get the supports of each traversal in light of best + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); + vector ranked_secondary_traversals = rank_by_support(secondary_supports); + + // add the homozygous genotype for our best allele + candidates[{best_allele, best_allele}] = {supports[best_allele], supports[best_allele]}; + + // now look at the top-k second-best traversals + size_t sec_count = 0; + for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { + int second_best_allele = ranked_secondary_traversals[j]; + if (!skips.count(second_best_allele) && second_best_allele != best_allele) { + // second best allele's support, sharing nodes with best + Support& second_best_support = secondary_supports[second_best_allele]; + // best allele's support, sharing nodes with second best + Support best_support_het = support_finder.get_traversal_set_support( + {traversals[best_allele], traversals[second_best_allele]}, + {1}, false, false, false, ref_trav_idx)[0]; + + // canonical ordering for our set + if (best_allele < second_best_allele) { + candidates[{best_allele, second_best_allele}] = {best_support_het, second_best_support}; + } else { + candidates[{second_best_allele, best_allele}] = {second_best_support, best_support_het}; + } + // also make sure we have our homozygous genotype for the second best allele + candidates[{second_best_allele, second_best_allele}] = {supports[second_best_allele], supports[second_best_allele]}; + ++sec_count; + } + } + } + } + + // expected depth from our coverage + const pair& start_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first); + const pair& end_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.second); + double exp_depth = (start_depth.first + end_depth.first) / 2.; + double depth_err = (start_depth.second + end_depth.second) / 2.; + assert(!isnan(exp_depth) && !isnan(depth_err)); + + // genotype (log) likelihoods + double best_genotype_likelihood = -numeric_limits::max(); + vector best_genotype; + for (const auto& candidate : candidates) { + double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); + if (gl > best_genotype_likelihood) { + best_genotype_likelihood = gl; + 
best_genotype = candidate.first; + } + } + + return best_genotype; +} + +double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, + const vector& genotype_supports, + const vector& traversals, + int ref_trav_idx, double exp_depth, double depth_err) { + + assert(genotype_supports.size() == genotype.size()); + assert(genotype.size() == 1 || genotype.size() == 2); + + + // we need the support of all traversals *not* in the genotype. + Support total_other_support; + // we are running in a mode that will ignore stuff in our genotype, and only count the remainders once. + vector other_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, ref_trav_idx); + for (auto& other_support : other_supports) { + total_other_support += other_support; + } + + // split the homozygous support into two + // from now on we'll treat it like two separate observations, each with half coverage + vector fixed_genotype_supports = genotype_supports; + if (std::equal(genotype_supports.begin() + 1, genotype_supports.end(), genotype_supports.begin(), + [&](const Support& s1, const Support& s2) { return support_val(s1) == support_val(s2); })) { + for (int i = 0; i < genotype_supports.size(); ++i) { + fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); + } + } + + // total support of the site + Support total_site_support = total_other_support; + for (auto& support : fixed_genotype_supports) { + total_site_support += support; + } + + // how many reads would we expect to not map to our genotype due to error + double error_rate = std::min(0.95, depth_err + baseline_mapping_error); + double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support); + + // and our likelihood for the unmapped reads we see: + double other_log_likelihood = poisson_prob_ln(std::round(support_val(total_other_support)), other_poisson_lambda); + + // how many reads do we expect for an allele? we use the expected coverage and just + // divide it out by the size of the genotype. + double allele_poisson_lambda = (exp_depth / (double)genotype.size()) * (1. 
- error_rate); + +#ifdef debug + cerr << "Computing prob of genotype: {"; + for (int i = 0; i < genotype.size(); ++i) { + cerr << genotype[i] << ","; + } + cerr << "}: tot_other_sup = " << total_other_support << " tot site sup = " << total_site_support + << " exp-depth = " << exp_depth << " depth-err = " << depth_err << " other-lambda = " << other_poisson_lambda + << " allele-lambda " << allele_poisson_lambda << endl; +#endif + + // now we compute the likelihood of our genotype + double alleles_log_likelihood = 0; + for (int i = 0; i < fixed_genotype_supports.size(); ++i) { + double allele_ll = poisson_prob_ln(std::round(support_val(fixed_genotype_supports[i])), allele_poisson_lambda); + alleles_log_likelihood += allele_ll; + +#ifdef debug + cerr << " a[" << i <<"]=" << genotype[i] << " sup=" << genotype_supports[i] << " fix-sup=" << fixed_genotype_supports[i] + << " prob " << allele_ll << endl; +#endif + } + +#ifdef debug + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood + << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; +#endif + + return alleles_log_likelihood + other_log_likelihood; +} + +void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant) { + + assert(traversals.size() == variant.alleles.size()); + + // Get the depth of the site + + // get the unique supports (useful only for getting a total) + vector unique_supports = support_finder.get_traversal_set_support(traversals, {}, false, true, true, 0); + Support site_support; + for (const Support& sup : unique_supports) { + site_support += sup; + } + double total_site_depth = support_val(site_support); + + // Set the variant's total depth + string depth_string = std::to_string((int64_t)round(total_site_depth)); + variant.format.push_back("DP"); + variant.info["DP"].push_back(depth_string); // We only have one sample, so variant depth = sample depth + + // And for the sample + variant.samples[sample_name]["DP"].push_back(depth_string); + + // get the allele depths + variant.format.push_back("AD"); + set called_allele_set(genotype.begin(), genotype.end()); + + for (int i = 0; i < traversals.size(); ++i) { + vector shared_travs; + bool in_genotype = called_allele_set.count(i); + if (in_genotype) { + // if we're in the genotype, then we share support with other alleles. 
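// genotype_likelihood() above multiplies Poisson terms: one per called allele with rate
// (exp_depth / ploidy) * (1 - error_rate), plus one for reads on uncalled traversals
// with rate error_rate * exp_depth. A minimal standalone sketch of those pieces,
// assuming poisson_prob_ln is the standard Poisson log-pmf (illustrative helpers only,
// not the exact vg utilities):
#include <cmath>
#include <cstdint>

static double poisson_log_pmf(int64_t k, double lambda) {
    // ln P(k; lambda) = k*ln(lambda) - lambda - ln(k!), where lgamma(k+1) == ln(k!)
    return k * std::log(lambda) - lambda - std::lgamma(k + 1.);
}

static double diploid_log_likelihood(int64_t allele1_reads, int64_t allele2_reads,
                                     int64_t other_reads, double exp_depth,
                                     double error_rate) {
    double allele_lambda = (exp_depth / 2.) * (1. - error_rate);  // expected reads per haplotype
    double other_lambda = error_rate * exp_depth;                 // expected stray reads
    return poisson_log_pmf(allele1_reads, allele_lambda)
        + poisson_log_pmf(allele2_reads, allele_lambda)
        + poisson_log_pmf(other_reads, other_lambda);
}
// Dividing such a natural-log value by ln(10) (approx. 2.3026) converts it to the log10
// scale that the VCF GL field expects, which is what the gl /= 2.30258 line below does.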
+ for (int a : called_allele_set) { + if (a != i) { + shared_travs.push_back(a); + } + } + } else { + // if we're not in the genotype, then we ignore support of everything in the genotype + shared_travs = genotype; + } + // we recompute all supports for each allele to get its support relative to the genotype + // there is certainly room for optimization via remembering some of this stuff here + vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); + variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); + } + + // get the genotype likelihoods + // as above, there's some overlap between these computations and those used in genotype() to begin with + // this is an issue with the class interface which probably tries too hard to avoid being VCF-dependent + // but if it causes a slowdown (hasn't seemed to be a factor so far), the code could be re-organized + // to either store some of this information, or compute the genotype and vcf fields in a single shot + variant.format.push_back("GL"); + + // expected depth from our coverage + pair ref_range = make_pair(variant.position, variant.position + variant.ref.length()); + const pair& start_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first); + const pair& end_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.second); + double exp_depth = (start_depth.first + end_depth.first) / 2.; + double depth_err = (start_depth.second + end_depth.second) / 2.; + assert(!isnan(exp_depth) && !isnan(depth_err)); + + // assume ploidy 2 + for (int i = 0; i < traversals.size(); ++i) { + for (int j = i; j < traversals.size(); ++j) { + vector genotype_supports; + if (i == j) { + // put the full support of the allele in for each copy of a homozygous genotype (genotype method expects this) + vector gt_supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false); + genotype_supports = {gt_supports[i], gt_supports[i]}; + } else { + // compute each support relative to the other + // todo: we can speed this up by saving above, or filtering down traversal list to just our genotype alleles + vector gt_supports = support_finder.get_traversal_set_support(traversals, {j}, false, false, false); + genotype_supports.push_back(gt_supports[i]); + gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); + genotype_supports.push_back(gt_supports[j]); + } + double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); + // convert from natural log to log10 by dividing by ln(10) + gl /= 2.30258; + variant.samples[sample_name]["GL"].push_back(std::to_string(gl)); + } + } + + // todo + /* + // Now do the filters + variant.filter = "PASS"; + if (min_site_support < min_mad_for_filter) { + // Apply Min Allele Depth cutoff across all alleles (even ref) + variant.filter = "lowad"; + } else if (min_ad_log_likelihood_for_filter != 0 && + ad_log_likelihood < min_ad_log_likelihood_for_filter) { + // We have a het, but the assignment of reads between the two branches is just too weird + variant.filter = "lowxadl"; + } else if ((int64_t)round(total(total_support)) < min_site_depth) { + // we don't have enough support to want to make a call + variant.filter = "lowdepth"; + } + */ +} + +void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { + + +} + +vector PoissonSupportSnarlCaller::rank_by_support(const vector&
supports) { + vector ranks(supports.size()); + for (int i = 0; i < supports.size(); ++i) { + ranks[i] = i; + } + std::sort(ranks.begin(), ranks.end(), [&](int a, int b) { + return support_val(supports[a]) > support_val(supports[b]); + }); + return ranks; +} } diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 40b4bab6784..1c3deace343 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -12,6 +12,7 @@ #include "snarls.hpp" #include "genotypekit.hpp" #include "traversal_support.hpp" +#include "algorithms/coverage_depth.hpp" namespace vg { @@ -27,11 +28,18 @@ class SnarlCaller { virtual ~SnarlCaller(); /// Get the genotype of a site + /// snarl : site + /// traversals : all traversals to consider + /// ref_trav_idx : index of reference path traversal in traversals (in case it needs special treatment) + /// ref_path : the reference path associated with the snarl + /// ref_range : the interval along the reference path (forward coordinates) spanned by snarl virtual vector genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, - int ploidy) = 0; - + int ploidy, + const string& ref_path_name, + pair ref_range) = 0; + /// Update INFO and FORMAT fields of the called variant virtual void update_vcf_info(const Snarl& snarl, const vector& traversals, @@ -47,23 +55,81 @@ class SnarlCaller { }; /** - * Find the genotype of some traversals in a site using read support + * Interface for a caller that relies on a TraversalSupportFinder + * and has a few very basic support-based cutoffs + * Not very exciting, but currently required for the LegacySupportCaller + * which needs this to interface with the RepresentativeTraversalFinder */ class SupportBasedSnarlCaller : public SnarlCaller { public: SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder); + virtual ~SupportBasedSnarlCaller(); + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant); + /// Set some of the parameters - void set_het_bias(double het_bias, double ref_het_bias = 0.); void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); + + /// Get the traversal support finder + const TraversalSupportFinder& get_support_finder() const; + + /// Get the minimum total support for call + virtual int get_min_total_support_for_call() const; + +protected: + + /// Get the best support out of a list of supports, ignoring skips + static int get_best_support(const vector& supports, const vector& skips); + + /// Relic from old code + static double support_val(const Support& support) { return total(support); }; + + const PathHandleGraph& graph; + + SnarlManager& snarl_manager; + + /// Get support from traversals + TraversalSupportFinder& support_finder; + + /// What's the minimum integer number of reads that must support a call? We + /// don't necessarily want to call a SNP as het because we have a single + // supporting read, even if there are only 10 reads on the site. + int min_total_support_for_call = 1; + /// what's the minimum ref or alt allele depth to give a PASS in the filter + /// column?
Also used as a min actual support for a second-best allele call + size_t min_mad_for_filter = 1; + /// what's the minimum total support (over all alleles) of the site to make + /// a call + size_t min_site_depth = 3; +}; + + +/** + * Find the genotype of some traversals in a site using read support and + * a bias ratio to tell heterozygous from homozygous + */ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { public: RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder); + virtual ~RatioSupportSnarlCaller(); + + /// Set some of the parameters + void set_het_bias(double het_bias, double ref_het_bias = 0.); /// Get the genotype of a site virtual vector genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, - int ploidy); + int ploidy, + const string& ref_path_name, + pair ref_range); /// Update INFO and FORMAT fields of the called variant virtual void update_vcf_info(const Snarl& snarl, @@ -78,20 +144,8 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Use min_alt_path_support threshold as cutoff virtual function get_skip_allele_fn() const; - /// Get the minimum total support for call - virtual int get_min_total_support_for_call() const; - - /// Get the traversal support finder - const TraversalSupportFinder& get_support_finder() const; - protected: - /// Get the best support out of a list of supports, ignoring skips - static int get_best_support(const vector& supports, const vector& skips); - - /// Relic from old code - static double support_val(const Support& support) { return total(support); }; - /// Get the bias used for comparing two traversals /// (It differs heuristically depending on whether they are alt/ref/het/hom/snp/indel /// see tuning parameters below) @@ -104,10 +158,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Tuning - /// What's the minimum integer number of reads that must support a call? We - /// don't necessarily want to call a SNP as het because we have a single - // supporting read, even if there are only 10 reads on the site. - int min_total_support_for_call = 1; /// What fraction of the reads supporting an alt are we willing to discount? /// At 2, if twice the reads support one allele as the other, we'll call /// homozygous instead of heterozygous. At infinity, every call will be @@ -120,26 +170,71 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Used for calling 1/2 calls. If both alts (times this bias) are greater than /// the reference, the call is made. set to 0 to deactivate. double max_ma_bias = 0; - /// what's the minimum ref or alt allele depth to give a PASS in the filter - /// column? Also used as a min actual support for a second-best allele call - size_t min_mad_for_filter = 1; - /// what's the minimum total support (over all alleles) of the site to make - /// a call - size_t min_site_depth = 3; /// what's the min log likelihood for allele depth assignments to PASS? double min_ad_log_likelihood_for_filter = -9; /// used only for pruning alleles in the VCFTraversalFinder: minimum support /// of an allele's alt-path for it to be considered in the brute-force enumeration double min_alt_path_support = 0.2; + +}; - const PathHandleGraph& graph; +/** + * Find the genotype of some traversals in a site using read support + * and a Poisson model based on expected depth.
Inspired, in part, + * by Paragraph, which uses a similar approach for genotyping break points + * + **/ +class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { +public: + PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder, + const algorithms::BinnedDepthIndex& depth_index); + virtual ~PoissonSupportSnarlCaller(); + + /// Get the genotype of a site + virtual vector genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range); + + /// Update INFO and FORMAT fields of the called variant + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant); - SnarlManager& snarl_manager; + /// Define any header fields needed by the above + virtual void update_vcf_header(string& header) const; - TraversalSupportFinder& support_finder; +protected: + + /// Compute likelihood of genotype as product of poisson probabilities + /// P[allele1] * P[allele2] * P[uncalled alleles] + /// Homozygous alleles are split into two, with half support each + /// The (natural) logarithm is returned + double genotype_likelihood(const vector& genotype, + const vector& genotype_supports, + const vector& traversals, + int ref_trav_idx, double exp_depth, double depth_err); + + /// Rank supports + vector rank_by_support(const vector& supports); + + /// Baseline mapping error rate (gets added to the standard error from coverage) + double baseline_mapping_error = 0.05; + + /// Consider up to the top-k traversals (based on support) for genotyping + size_t top_k = 25; + /// Map path name to binned depth coverage from the packer + const algorithms::BinnedDepthIndex& depth_index; + }; + // debug helpers inline string to_string(const HandleGraph& graph, handle_t handle) { return std::to_string(graph.get_id(handle)) + ":" + std::to_string(graph.get_is_reverse(handle)); diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index c2ebac17dda..fbe049486a4 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -28,8 +28,9 @@ void help_call(char** argv) { << endl << "support calling options:" << endl << " -k, --pack FILE Supports created from vg pack for given input graph" << endl - << " -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6]" << endl << " -m, --min-support M,N Minimum allele support (M) and minimum site support (N) for call [default = 1,4]" << endl + << " -B, --bias-mode Use old ratio-based genotyping algorithm as opposed to probabilistic model" << endl + << " -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6]" << endl << "general options:" << endl << " -v, --vcf FILE VCF file to genotype (must have been used to construct input graph with -a)" << endl << " -f, --ref-fasta FILE Reference fasta (required if VCF contains symbolic deletions or inversions)" << endl @@ -55,6 +56,7 @@ int main_call(int argc, char** argv) { vector ref_path_lengths; string min_support_string; string bias_string; + bool ratio_caller = false; int c; optind = 2; // force optind past command positional argument @@ -62,6 +64,7 @@ int main_call(int argc, char** argv) { static const struct option long_options[] = { {"pack", required_argument, 0, 'k'}, + {"bias-mode", no_argument, 0, 'B'}, {"het-bias",
required_argument, 0, 'b'}, {"min-support", required_argument, 0, 'm'}, {"vcf", required_argument, 0, 'v'}, @@ -79,7 +82,7 @@ int main_call(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "k:b:m:v:f:i:s:r:p:o:l:t:h", + c = getopt_long (argc, argv, "k:Bb:m:v:f:i:s:r:p:o:l:t:h", long_options, &option_index); // Detect the end of the options. @@ -91,6 +94,9 @@ int main_call(int argc, char** argv) { case 'k': pack_filename = optarg; break; + case 'B': + ratio_caller = true; + break; case 'b': bias_string = optarg; break; @@ -218,6 +224,11 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: when using -l, the same number of paths must be given with -p" << endl; return 1; } + // Check bias option + if (!bias_string.empty() && !ratio_caller) { + cerr << "error [vg call]: -b can only be used with -B" << endl; + return 1; + } // No paths specified: use them all if (ref_paths.empty()) { @@ -245,6 +256,7 @@ int main_call(int argc, char** argv) { unique_ptr graph_caller; unique_ptr snarl_caller; + algorithms::BinnedDepthIndex depth_index; // Make a Packed Support Caller unique_ptr packer; @@ -256,14 +268,27 @@ int main_call(int argc, char** argv) { // Make a packed traversal support finder PackedTraversalSupportFinder* packed_support_finder = new PackedTraversalSupportFinder(*packer, *snarl_manager); support_finder = unique_ptr(packed_support_finder); - // Make a support caller - SupportBasedSnarlCaller* packed_caller = new SupportBasedSnarlCaller(*graph, *snarl_manager, *packed_support_finder); - if (het_bias >= 0) { - packed_caller->set_het_bias(het_bias, ref_het_bias); + + SupportBasedSnarlCaller* packed_caller = nullptr; + + if (ratio_caller == false) { + // Make a depth index + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 1000000, 0, true, true); + // Make a new-style probabilistic caller + auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index); + packed_caller = poisson_caller; + } else { + // Make an old-style ratio support caller + auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder); + if (het_bias >= 0) { + ratio_caller->set_het_bias(het_bias, ref_het_bias); + } + packed_caller = ratio_caller; } if (min_allele_support >= 0) { packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support); } + snarl_caller = unique_ptr(packed_caller); } diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index 9372a995a73..e3fed43860a 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -60,15 +60,17 @@ tuple TraversalSupportFinder::get_child_support(const Sna Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const { - return get_traversal_set_support({traversal}, {}, false, false).at(0); + return get_traversal_set_support({traversal}, {}, false, false, false).at(0); } vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, - const vector& shared_travs, - bool exclusive_only, - bool exclusive_count, - int ref_trav_idx) const { - + const vector& shared_travs, + bool exclusive_only, + bool exclusive_count, + bool unique, + int ref_trav_idx) const { + assert(!unique || (exclusive_count || exclusive_only)); + // pass 1: how many times have we seen a node or edge unordered_map node_counts; unordered_map edge_counts; @@ -169,12 +171,16 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector 0 && visit_idx <
trav.visit_size() - 1)) { @@ -188,7 +194,9 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector #include @@ -52,10 +52,13 @@ class TraversalSupportFinder { /// exclusive_count is like exclusive only except shared traversals will be counted (as 0) /// when doing average and min support /// if the ref_trav_idx is given, it will be used for computing (deletion) edge lengths + /// if unique is true, then every node or edge will only be counted once + /// (useful for total support) virtual vector get_traversal_set_support(const vector& traversals, const vector& shared_travs, bool exclusive_only, bool exclusive_count, + bool unique, int ref_trav_idx = -1) const; /// Get the total length of all nodes in the traversal From e7c5bca5b0e661e46f87189399bd166e60968dc3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 16:24:01 -0500 Subject: [PATCH 31/79] bug fixes. more aggressive exclusive support filtering --- src/algorithms/coverage_depth.cpp | 5 ++++- src/graph_caller.cpp | 2 +- src/snarl_caller.cpp | 17 ++++++++++++++--- src/subcommand/call_main.cpp | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index 0d63f656cb9..bd46c1aabc5 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -157,7 +157,10 @@ unordered_map>> binned_packed_depth_ind const pair& get_depth_from_index(const unordered_map>>& depth_index, const string& path_name, size_t offset) { - return depth_index.at(path_name).lower_bound(offset)->second; + + auto ub = depth_index.at(path_name).upper_bound(offset); + --ub; + return ub->second; } // draw (roughly) max_nodes nodes from the graph using the random seed diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 55e8d097759..22622e475e0 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -241,7 +241,7 @@ tuple VCFGenotyper::get_ref_positions(const vector> path_offsets; for (const vcflib::Variant* var : variants) { if (path_offsets.count(var->sequenceName)) { - pair& record = path_offsets[var->ref]; + pair& record = path_offsets[var->sequenceName]; record.first = std::min((size_t)var->position, record.first); record.second = std::max((size_t)var->position + var->ref.length(), record.second); } else { diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 77393dce867..5dfba9dee81 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -491,11 +491,18 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, // the candidate genotypes and their supports. 
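// The get_depth_from_index() fix in this patch swaps lower_bound for
// upper_bound-then-decrement: with bins keyed by their start offset, the covering bin is
// the last key less than or equal to the query, not the first key at or after it. A
// standalone sketch over a plain std::map (depth_at is a hypothetical name; it assumes
// the first bin starts at offset 0, so the iterator never steps before begin()):
#include <cstddef>
#include <map>
#include <utility>

static std::pair<double, double> depth_at(const std::map<size_t, std::pair<double, double>>& bins,
                                          size_t offset) {
    auto it = bins.upper_bound(offset);  // first bin starting strictly after offset
    --it;                                // step back to the bin containing offset
    return it->second;                   // (mean coverage, variance or std error)
}
// lower_bound(offset) would instead land on the bin *at or after* offset, reading the
// wrong bin for any query that is not exactly on a bin boundary.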
the numbers here are alleles as indexed in traversals[] map, vector> candidates; + // pre-filter out some alleles based on poor exclusive support + set skips; + // consider each of the top 25 traversals as our top_traversal for (int i = 0; i < max_trav; ++i) { int best_allele = ranked_traversals[i]; + if (skips.count(best_allele)) { + continue; + } + if (ploidy == 1) { candidates[{best_allele}] = {supports[best_allele]}; } else { @@ -504,7 +511,6 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); - set skips = {best_allele}; for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { if (j != best_allele && support_val(secondary_exclusive_supports[j]) <= min_total_support_for_call) { skips.insert(j); @@ -562,6 +568,9 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } } +#ifdef debug + cerr << " best genotype: "; for (auto a : best_genotype) {cerr << a <<",";} cerr << " gl=" << best_genotype_likelihood << endl; +#endif return best_genotype; } @@ -746,8 +755,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { - - + header += "##INFO=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; } vector PoissonSupportSnarlCaller::rank_by_support(const vector& supports) { diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index fbe049486a4..3f69185b0c9 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -273,7 +273,7 @@ int main_call(int argc, char** argv) { if (ratio_caller == false) { // Make a depth index - depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 1000000, 0, true, true); + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true); // Make a new-style probabilistic caller auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index); packed_caller = poisson_caller; From faa15ecaceeccecdee261698936a093d580cb7d7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 16:57:50 -0500 Subject: [PATCH 32/79] another exclusive support bug --- src/snarl_caller.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 5dfba9dee81..031e379dee1 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -512,7 +512,9 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, // doesn't meet a certain cutoff vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { - if (j != best_allele && support_val(secondary_exclusive_supports[j]) <= min_total_support_for_call) { + if (j != best_allele && + support_val(secondary_exclusive_supports[j]) < min_total_support_for_call && + support_val(secondary_exclusive_supports[j]) < support_val(supports[j])) { skips.insert(j); } } From d2705eb3a15a3a47df86b598c546d914c310de1c Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Fri, 8 Nov 2019 14:38:13 -0800 Subject: [PATCH 33/79] Passed random unit tests --- src/seed_clusterer.cpp | 52
+++++++++++++++------------------ src/seed_clusterer.hpp | 23 +++++++-------- src/unittest/seed_clusterer.cpp | 3 +- 3 files changed, 35 insertions(+), 43 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index 31fe6e6b7b7..f0119d2cd34 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -2,7 +2,7 @@ #include -#define DEBUG_CLUSTER +//#define DEBUG_CLUSTER namespace vg { @@ -12,14 +12,14 @@ namespace vg { SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (vector seeds, int64_t read_distance_limit) const { vector> all_seeds; - all_seeds.push_back(std::move(seeds)); - tuple>>,vector>> all_clusters = + all_seeds.push_back(seeds); + tuple,SnarlSeedClusterer::cluster_group_t> all_clusters = cluster_seeds(all_seeds, read_distance_limit, 0); return std::get<0>(all_clusters)[0]; }; tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( - vector> all_seeds, int64_t read_distance_limit, + vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit) const { /* Given a vector of seeds and a limit, find a clustering of seeds where * seeds that are closer than the limit cluster together. @@ -122,7 +122,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cerr << endl; } #endif - vector>> read_clusters; + vector read_clusters; for (auto& uf : tree_state.read_union_find) { read_clusters.emplace_back(uf.all_groups()); } @@ -320,7 +320,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { size_t group_id = seed_range_start->second; - if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { //For each seed on this node, add it to the cluster @@ -344,6 +343,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { tree_state.read_union_find[read_num].union_groups(group_id, iter->second); if (tree_state.fragment_distance_limit != 0 ) { + if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; tree_state.fragment_union_find.union_groups(fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); } @@ -444,8 +444,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { size_t read_num = std::get<0>(s); if (read_last_offset[read_num] != -1 && - abs(std::get<2>(s) - read_last_offset[read_num]) <= tree_state.read_distance_limit) { - //TODO: Need abs? 
+ std::get<2>(s) - read_last_offset[read_num] <= tree_state.read_distance_limit) { //If this seed is in the same read cluster as the previous one, //union them @@ -474,7 +473,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { make_pair(read_last_offset[read_num], node_length - read_last_offset[read_num] + 1); if (tree_state.fragment_distance_limit != 0) { if (fragment_last_offset != -1 && - abs(std::get<2>(s) - fragment_last_offset) <= tree_state.fragment_distance_limit) { + std::get<2>(s) - fragment_last_offset <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); fragment_last_cluster = tree_state.fragment_union_find.find_group(fragment_last_cluster); @@ -619,7 +618,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { fragment_combined_group = tree_state.fragment_union_find.find_group( new_group + tree_state.read_index_offsets[read_num]); } - cerr << endl; return; }; //The clusters of the chain that are built from the snarl clusters @@ -1331,7 +1329,6 @@ cerr << " Combining this cluster from the right" << endl; pair dists_c = tree_state.read_cluster_dists[child_cluster_head.first][child_cluster_head.second]; old_dists[child_cluster_head] = dists_c; - //TODO: Do this only once pair new_dists = snarl_index.distToEnds(node_rank, dists_c.first,dists_c.second); #ifdef DEBUG_CLUSTER @@ -1431,35 +1428,35 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank //from the left of both of them int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1; + int64_t fragment_dist = dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1; combine_clusters(c_group, group_l_l[read_num], fragment_group_l_l, - dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_l_r != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j + int64_t fragment_dist = dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? -1 : dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, - dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_l != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_left != -1 ) { + int64_t fragment_dist = dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1; int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, - dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_r != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_right != -1 ) { + int64_t fragment_dist = dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? 
-1 : dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_r_r[read_num], fragment_group_r_r, - dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } } @@ -1484,39 +1481,38 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.first != -1 ){ + int64_t fragment_dist = dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : dist_l_l + curr_child_clusters.read_best_left[read_num] + dists_k.first-1; combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, - dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1, - read_dist, read_num); + fragment_dist,read_dist, read_num); } if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.second != -1 ) { int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : dist_l_r + curr_child_clusters.read_best_left[read_num] + dists_k.second-1; - + int64_t fragment_dist = dist_l_r + curr_child_clusters.fragment_best_left + dists_k.second-1; combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, - dist_l_r + curr_child_clusters.fragment_best_left + dists_k.second-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.first != -1 ) { + int64_t fragment_dist = dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : dist_r_l + curr_child_clusters.read_best_right[read_num] + dists_k.first-1; combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, - dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.second != -1 ) { + int64_t fragment_dist = dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : dist_r_r + curr_child_clusters.read_best_right[read_num] + dists_k.second-1; combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, - dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } } } diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index f95f0261ae9..92c6b604f46 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -14,27 +14,26 @@ class SnarlSeedClusterer { SnarlSeedClusterer(MinimumDistanceIndex& dist_index); + //Represents all clusters for one vector of seeds + //Each cluster is a vector of indexes into the vector of seeds typedef vector> cluster_group_t; ///Given a vector of seeds (pos_t) and a distance limit, //cluster the seeds such that two seeds whose minimum distance //between them (including both of the positions) is less than // the distance limit are in the same cluster - // - //Returns a vector of clusters. Each cluster is a vector of - //indices into seeds cluster_group_t cluster_seeds ( vector seeds, int64_t read_distance_limit) const; ///The same thing, but for paired end reads. 
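// A self-contained sketch (editor's illustration, not vg code) of the
// fragment-index convention used by the paired-end interface below:
// read_index_offsets, the prefix-sum field documented further down, maps a
// (read number, seed index) pair to the index the seed would have if the
// per-read seed vectors were appended to each other.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> seeds_per_read = {4, 3, 5};    // hypothetical per-read seed counts
    std::vector<size_t> read_index_offsets(1, 0);
    for (size_t count : seeds_per_read) {
        read_index_offsets.push_back(read_index_offsets.back() + count);
    }
    // read_index_offsets is now {0, 4, 7, 12}
    size_t read_num = 1, seed_index = 2;
    std::printf("%zu\n", read_index_offsets[read_num] + seed_index);  // prints 6
    return 0;
}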
- //Given seeds from multiple reads of a fragment, cluster each set of seeds - //by the read distance and all seeds by the fragment distance limit + //Given seeds from multiple reads of a fragment, cluster each read + //by the read distance and all seeds by the fragment distance limit. //fragment_distance_limit must be greater than read_distance_limit //Returns clusters for each read and clusters of all the seeds in all reads //The read clusters refer to seeds by their indexes in the input vectors of seeds //The fragment clusters give seeds the index they would get if the vectors of // seeds were appended to each other in the order given tuple, cluster_group_t> cluster_seeds ( - vector> all_seeds, + vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit=0) const; private: @@ -137,7 +136,8 @@ class SnarlSeedClusterer { //Vector of all the seeds for each read vector>* all_seeds; - //Vector of the offset of indices for each seed + //prefix sum vector of the number of seeds per read + //To get the index of a seed for the fragment clusters vector read_index_offsets; //The minimum distance between nodes for them to be put in the @@ -191,7 +191,8 @@ class SnarlSeedClusterer { hash_map>> parent_snarl_to_nodes; - //Constructor takes in a pointer to the seeds and the distance limit + //Constructor takes in a pointer to the seeds, the distance limits, and + //the total number of seeds in all_seeds TreeState (vector>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), @@ -200,9 +201,7 @@ class SnarlSeedClusterer { fragment_union_find (seed_count, false), read_index_offsets(1,0){ - size_t total_seeds = 0; for (vector& v : *all_seeds) { - total_seeds += v.size(); size_t offset = read_index_offsets.back() + v.size(); read_index_offsets.push_back(offset); read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1)); @@ -231,8 +230,7 @@ class SnarlSeedClusterer { //Cluster all the chains at the current level void cluster_chains(TreeState& tree_state, size_t depth) const; - //Given a node and the indices of seeds on that node, root, - //cluster the seeds + //Cluster the seeds on the specified node NodeClusters cluster_one_node(TreeState& tree_state, id_t node_id, int64_t node_length) const; @@ -243,7 +241,6 @@ class SnarlSeedClusterer { //Cluster the seeds in a snarl given by snarl_index_i, an index into //dist_index.snarl_indexes - //rev is true if this snarl is reversed in its parent NodeClusters cluster_one_snarl(TreeState& tree_state, size_t snarl_index_i) const; diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 8e107e8e99d..e6fec99233c 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -845,7 +845,7 @@ namespace unittest { uniform_int_distribution randSnarlIndex(0, allSnarls.size()-1); default_random_engine generator(time(NULL)); - for (size_t k = 0; k < 100 ; k++) { + for (size_t k = 0; k < 1000 ; k++) { vector> all_seeds; all_seeds.emplace_back(); all_seeds.emplace_back(); @@ -892,7 +892,6 @@ namespace unittest { for (size_t a = 0; a < one_read_clusters.size(); a++) { // For each cluster -cluster this cluster to ensure that // there is only one - cerr << a << " of " << one_read_clusters.size() << endl; vector clust = one_read_clusters[a]; structures::UnionFind new_clusters (clust.size(), false); From 820ad0fd174e56c648173a1cc70823e4543feb54 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 21:54:28 -0500 Subject: [PATCH 34/79] 
fix bugs to get bash tap tests going --- src/algorithms/coverage_depth.cpp | 2 +- src/snarl_caller.cpp | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index bd46c1aabc5..6d28335f4d7 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -146,7 +146,7 @@ unordered_map>> binned_packed_depth_ind double var = get<3>(binned_depth); // optionally convert variance to standard error if (std_err) { - var = sqrt(var / (double)(get<1>(binned_depth) - get<2>(binned_depth))); + var = sqrt(var / (double)(get<1>(binned_depth) - get<0>(binned_depth))); } depth_map[get<0>(binned_depth)] = make_pair(get<2>(binned_depth), var); } diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 031e379dee1..5ca35c7c2a0 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -610,7 +610,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } // how many reads would we expect to not map to our genotype due to error - double error_rate = std::min(0.95, depth_err + baseline_mapping_error); + double error_rate = std::min(0.25, depth_err + baseline_mapping_error); double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support); // and our likelihood for the unmapped reads we see: @@ -733,8 +733,13 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) - gl /= 2.30258; - variant.samples[sample_name]["GL"].push_back(std::to_string(gl)); + variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); + + // use our likelihood as the VCF quality + // todo: check if there's something more conventional to use + if ((genotype[0] == i && genotype[1] == j) || (genotype[0] == j && genotype[1] == i)) { + variant.quality = logprob_to_phred(gl); + } } } From 1eeb6af52bf2199f1925cc47a5cdbd8339c74c2c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 21:58:46 -0500 Subject: [PATCH 35/79] turn off debug output --- src/snarl_caller.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 5ca35c7c2a0..d22171ba617 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -1,7 +1,7 @@ #include "snarl_caller.hpp" #include "genotypekit.hpp" -#define debug +//#define debug namespace vg { From d7fb34e700ab63478b3a6581dfbfbef79340968c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 11 Nov 2019 16:14:42 -0500 Subject: [PATCH 36/79] add function for ewens sampling probability --- src/distributions.hpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/distributions.hpp b/src/distributions.hpp index fb5a4b3cdb4..6543cacc0db 100644 --- a/src/distributions.hpp +++ b/src/distributions.hpp @@ -785,6 +785,38 @@ class discrete_distribution { }; +// ewen's allele sampling distribution. for use in genotype prior (as in freebayes) +// gives Pr(a1, ...,an;theta) where ai is the number of sampled haplotypes (out of n) that +// have i different alleles at a given locus. theta is the population mutation rate. +// ex: for a single diploid genotype, a={2,0} = heterozygous: 2 alleles occur once. +// a={0,1} = homozygous: 1 allele occurs twice. 
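// A standalone numeric check (editor's sketch, not part of the patch) of the
// two diploid cases above: for n = 2 haplotypes, Ewens's formula collapses to
// Pr(het) = theta / (1 + theta) and Pr(hom) = 1 / (1 + theta), which sum to 1,
// so a small theta makes heterozygous configurations rare a priori.
#include <cstdio>

int main() {
    double theta = 0.001;                  // assumed population mutation rate
    double p_het = theta / (1. + theta);   // a = {2,0}: 2!/(theta(theta+1)) * theta^2/(1^2 * 2!)
    double p_hom = 1. / (1. + theta);      // a = {0,1}: 2!/(theta(theta+1)) * theta^1/(2^1 * 1!)
    std::printf("het %g hom %g sum %g\n", p_het, p_hom, p_het + p_hom);
    return 0;
}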
+// +// https://en.wikipedia.org/wiki/Ewens%27s_sampling_formula +// https://github.com/ekg/freebayes/blob/master/src/Ewens.cpp#L17 +inline real_t ewens_af_prob_ln(const vector& a, real_t theta) { + + // first term (wrt formula as stated on wikipedia) + // n! / (theta * (theta + 1) * ... (theta + n - 1)) + real_t term1_num_ln = factorial_ln(a.size()); + real_t term1_denom_ln = 0.; + for (int i = 0; i < a.size(); ++i) { + term1_denom_ln += log(theta + i); + } + real_t term1_ln = term1_num_ln - term1_denom_ln; + + // second term + // prod [ (theta^aj) / (j^aj * aj!) ] + real_t term2_ln = 0.; + for (int j = 0; j < a.size(); ++j) { + // ln(theta^aj) = aj * ln(theta) + real_t num = a[j] * log(theta); + // ln(j^aj * aj!) = aj * ln(j) + ln(aj!); the two log terms add, + // they must not be summed inside a single log() + real_t denom = a[j] * log(1. + j) + factorial_ln(a[j]); + term2_ln += num - denom; + } + + return term1_ln + term2_ln; +} + + } #endif From fd5b96731d4502817366baef5e3c5555c1063899 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 11 Nov 2019 16:17:20 -0500 Subject: [PATCH 37/79] let poisson caller use min-ad and het-bias from old caller for now (hopefully temporary) --- src/snarl_caller.cpp | 131 +++++++++++++++++++++-------------- src/snarl_caller.hpp | 38 +++++----- src/subcommand/call_main.cpp | 13 ++-- 3 files changed, 103 insertions(+), 79 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index d22171ba617..475b6454666 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -54,6 +54,19 @@ void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double m } } +void SupportBasedSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { int best_allele = -1; for(size_t i = 0; i < supports.size(); i++) { @@ -65,6 +78,38 @@ int SupportBasedSnarlCaller::get_best_support(const vector& supports, c return best_allele; } +double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const { + bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || + (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); + + double bias_limit = 1; + + if (best_trav >= 0 && second_best_trav >=0) { + if (best_trav == ref_trav_idx) { + // Use ref bias limit + + // We decide closeness differently depending on whether best is ref + // or not. In practice, we use this to slightly penalize homozygous + // ref calls (by setting max_ref_het_bias higher than max_het_bias) + // and rather make a less supported alt call instead. This boost + // max sensitivity, and because everything is homozygous ref by + // default in VCF, any downstream filters will effectively reset + // these calls back to homozygous ref. TODO: This shouldn't apply + // when off the primary path! 
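// A small numeric sketch (editor's note, not part of the diff): later in this
// patch, genotype_likelihood() reuses the bias returned here as a pseudo-prior,
// computing log(1. - 1. / het_bias) for a het and log(1. / het_bias) for a hom.
#include <cmath>
#include <cstdio>

int main() {
    double het_bias = 6.;                      // the default max_het_bias
    std::printf("het prior %f hom prior %f\n",
                std::log(1. - 1. / het_bias),  // log(5/6) ~ -0.182
                std::log(1. / het_bias));      // log(1/6) ~ -1.792
    return 0;
}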
+ bias_limit = max_ref_het_bias; + } else if (is_indel) { + // This is an indel + // Use indel bias limit + bias_limit = max_indel_het_bias; + } else { + // Use normal het bias limit + bias_limit = max_het_bias; + } + } + return bias_limit; +} + RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder) : SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { @@ -74,19 +119,6 @@ RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { } -void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { - // want to move away from ugly hacks that treat the reference traversal differently, - // so keep all these set the same - if (het_bias >= 0) { - max_het_bias = het_bias; - max_ref_het_bias = het_bias; - max_indel_het_bias = het_bias; - } - if (ref_het_bias >= 0) { - max_ref_het_bias = ref_het_bias; - } -} - vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, @@ -418,39 +450,6 @@ function RatioSupportSnarlCaller::get_skip_allele_f }; } -double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, - int second_best_trav, int ref_trav_idx) const { - bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || - (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); - - double bias_limit = 1; - - if (best_trav >= 0 && second_best_trav >=0) { - if (best_trav == ref_trav_idx) { - // Use ref bias limit - - // We decide closeness differently depending on whether best is ref - // or not. In practice, we use this to slightly penalize homozygous - // ref calls (by setting max_ref_het_bias higher than max_het_bias) - // and rather make a less supported alt call instead. This boost - // max sensitivity, and because everything is homozygous ref by - // default in VCF, any downstream filters will effectively reset - // these calls back to homozygous ref. TODO: This shouldn't apply - // when off the primary path! 
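// The genotype_likelihood() changes further down score observed read depths
// against Poisson expectations. A minimal standalone sketch (editor's
// illustration, not vg's implementation) of the underlying log-pmf, with
// lgamma supplying ln(d!):
#include <cmath>
#include <cstdio>

double poisson_prob_ln(double observed, double lambda) {
    // ln P(observed | lambda) = observed * ln(lambda) - lambda - ln(observed!)
    return observed * std::log(lambda) - lambda - std::lgamma(observed + 1.);
}

int main() {
    double exp_depth = 30.;                                // assumed expected depth
    std::printf("%f\n", poisson_prob_ln(28., exp_depth));  // ~ -2.66
    return 0;
}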
- bias_limit = max_ref_het_bias; - } else if (is_indel) { - // This is an indel - // Use indel bias limit - bias_limit = max_indel_het_bias; - } else { - // Use normal het bias limit - bias_limit = max_het_bias; - } - } - return bias_limit; -} - - PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder, const algorithms::BinnedDepthIndex& depth_index) : @@ -563,7 +562,7 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double best_genotype_likelihood = -numeric_limits::max(); vector best_genotype; for (const auto& candidate : candidates) { - double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); + double gl = genotype_likelihood(candidate.first, candidate.second, traversals, traversal_sizes, ref_trav_idx, exp_depth, depth_err); if (gl > best_genotype_likelihood) { best_genotype_likelihood = gl; best_genotype = candidate.first; @@ -579,6 +578,7 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, const vector& genotype_supports, const vector& traversals, + const vector& traversal_sizes, int ref_trav_idx, double exp_depth, double depth_err) { assert(genotype_supports.size() == genotype.size()); @@ -602,6 +602,24 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); } } + + // we preserve our het-bias from RatioSupportSnarlCaller as something prior-like here + // todo: use something better + double het_prior = 0.; + double ew_prior = 0.; + if (genotype.size() == 2) { + double het_bias = get_bias(traversal_sizes, genotype[0], genotype[1], ref_trav_idx); + if (genotype[0] != genotype[1]) { + // if the het_bias is greater than 1 (usually it's 6 by default), then + // we get a prior of 5/6 for a het + het_prior = log(1. - 1. / het_bias); + ew_prior = ewens_af_prob_ln({2, 0}, 0.001); + } else { + // and 1/6 for a hom + het_prior = log(1. 
/ het_bias); + ew_prior = ewens_af_prob_ln({0, 1}, 0.001); + } + } // total support of the site Support total_site_support = total_other_support; @@ -643,11 +661,12 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } #ifdef debug - cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood - << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood << " prior " << het_prior + << " ew prior " << ew_prior + << " total-prob " << (alleles_log_likelihood + other_log_likelihood + het_prior) << endl; #endif - return alleles_log_likelihood + other_log_likelihood; + return alleles_log_likelihood + other_log_likelihood + het_prior; } void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, @@ -668,6 +687,8 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } double total_site_depth = support_val(site_support); + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + // Set the variant's total depth string depth_string = std::to_string((int64_t)round(total_site_depth)); variant.format.push_back("DP"); @@ -679,6 +700,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); set called_allele_set(genotype.begin(), genotype.end()); + double min_site_support = genotype.size() > 0 ? INFINITY : 0; for (int i = 0; i < traversals.size(); ++i) { vector shared_travs; @@ -698,6 +720,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // there is certainly room for optimization via remembering some of this stuff here vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); + if (in_genotype) { + // update the minimum support + min_site_support = min(min_site_support, total(allele_supports[i])); + } } // get the genotype likelihoods @@ -731,7 +757,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); genotype_supports.push_back(gt_supports[j]); } - double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); + double gl = genotype_likelihood({i, j}, genotype_supports, traversals, traversal_sizes, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); @@ -743,6 +769,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } + // use old quality for now + variant.quality = min_site_support; + // todo /* // Now do the filters diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 1c3deace343..e2d7aa78f5c 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -75,6 +75,8 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Set some of the parameters void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); + + void set_het_bias(double het_bias, double ref_het_bias = 0.); /// Get the traversal support finder const TraversalSupportFinder& get_support_finder() const; @@ -90,6 +92,12 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Relic from old code static double 
support_val(const Support& support) { return total(support); }; + /// Get the bias used to for comparing two traversals + /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel + /// see tuning parameters below) + double get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const; + const PathHandleGraph& graph; SnarlManager& snarl_manager; @@ -107,6 +115,15 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// what's the minimum total support (over all alleles) of the site to make /// a call size_t min_site_depth = 3; + /// What fraction of the reads supporting an alt are we willing to discount? + /// At 2, if twice the reads support one allele as the other, we'll call + /// homozygous instead of heterozygous. At infinity, every call will be + /// heterozygous if even one read supports each allele. + double max_het_bias = 6; + /// Like above, but applied to ref / alt ratio (instead of alt / ref) + double max_ref_het_bias = 6; + /// Like the max het bias, but applies to novel indels. + double max_indel_het_bias = 6; }; @@ -120,9 +137,6 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { TraversalSupportFinder& support_finder); virtual ~RatioSupportSnarlCaller(); - /// Set some of the parameters - void set_het_bias(double het_bias, double ref_het_bias = 0.); - /// Get the genotype of a site virtual vector genotype(const Snarl& snarl, const vector& traversals, @@ -146,27 +160,12 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { protected: - /// Get the bias used to for comparing two traversals - /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel - /// see tuning parameters below) - double get_bias(const vector& traversal_sizes, int best_trav, - int second_best_trav, int ref_trav_idx) const; - /// get a map of the beginning of a node (in forward orientation) on a traversal /// used for up-weighting large deletion edges in complex snarls with average support unordered_map get_ref_offsets(const SnarlTraversal& ref_trav) const; /// Tuning - /// What fraction of the reads supporting an alt are we willing to discount? - /// At 2, if twice the reads support one allele as the other, we'll call - /// homozygous instead of heterozygous. At infinity, every call will be - /// heterozygous if even one read supports each allele. - double max_het_bias = 6; - /// Like above, but applied to ref / alt ratio (instead of alt / ref) - double max_ref_het_bias = 6; - /// Like the max het bias, but applies to novel indels. - double max_indel_het_bias = 6; /// Used for calling 1/2 calls. If both alts (times this bias) are greater than /// the reference, the call is made. set to 0 to deactivate. 
double max_ma_bias = 0; @@ -218,13 +217,14 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { double genotype_likelihood(const vector& genotype, const vector& genotype_supports, const vector& traversals, + const vector& traversal_sizes, int ref_trav_idx, double exp_depth, double depth_err); /// Rank supports vector rank_by_support(const vector& supports); /// Baseline mapping error rate (gets added to the standard error from coverage) - double baseline_mapping_error = 0.05; + double baseline_mapping_error = 0.005; /// Consider up to the top-k traversals (based on support) for genotyping size_t top_k = 25; diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 3f69185b0c9..4a611c67303 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -224,11 +224,6 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: when using -l, the same number paths must be given with -p" << endl; return 1; } - // Check bias option - if (!bias_string.empty() && !ratio_caller) { - cerr << "error [vg call]: -b can only be used with -B" << endl; - return 1; - } // No paths specified: use them all if (ref_paths.empty()) { @@ -273,21 +268,21 @@ int main_call(int argc, char** argv) { if (ratio_caller == false) { // Make a depth index - depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true); + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 50000, 0, true, true); // Make a new-stype probablistic caller auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index); packed_caller = poisson_caller; } else { // Make an old-style ratio support caller auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder); - if (het_bias >= 0) { - ratio_caller->set_het_bias(het_bias, ref_het_bias); - } packed_caller = ratio_caller; } if (min_allele_support >= 0) { packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support); } + if (het_bias >= 0) { + packed_caller->set_het_bias(het_bias, ref_het_bias); + } snarl_caller = unique_ptr(packed_caller); } From ed7538a438da0b196482581d60c7e9e84078a1c8 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 11 Nov 2019 14:01:06 -0800 Subject: [PATCH 38/79] Moved combining chain clusters to a helper --- src/seed_clusterer.cpp | 301 +++++++++++++++++++---------------------- src/seed_clusterer.hpp | 4 +- 2 files changed, 144 insertions(+), 161 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index f0119d2cd34..dd16f19c8c5 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -2,7 +2,7 @@ #include -//#define DEBUG_CLUSTER +#define DEBUG_CLUSTER namespace vg { @@ -84,10 +84,10 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //Cluster all the snarls at this depth //Also records which snarls are in chains and the parents of these //snarls in tree_state.parent_snarl_to_node - cluster_snarls(tree_state, depth); + cluster_snarl_level(tree_state, depth); //And cluster all the chains, record the parents of these chains - cluster_chains(tree_state, depth); + cluster_chain_level(tree_state, depth); // Swap buffer over for the next level tree_state.snarl_to_nodes = move(tree_state.parent_snarl_to_nodes); @@ -138,36 +138,33 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { void SnarlSeedClusterer::get_nodes( TreeState& tree_state, vector>>>& - snarl_to_nodes_by_level) const { + 
snarl_to_nodes_by_level) const { // Assign each seed to a node. + hash_set seen_nodes; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ vector& seeds = tree_state.all_seeds->at(read_num); for (size_t i = 0; i < seeds.size(); i++) { id_t id = get_id(seeds.at(i)); + + //Assign the seed to a node tree_state.node_to_seeds[read_num].emplace_back(id, i); - //For each seed, assign it to a node and the node to a snarl - } - std::sort(tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end()); - } - // Assign each node to a snarl. - hash_set seen_nodes; - for (auto& read_node :tree_state.node_to_seeds) { - for (auto& mapping : read_node) { - if (seen_nodes.count(mapping.first) < 1) { - seen_nodes.insert( mapping.first); - size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); + //And the node to a snarl + if (seen_nodes.count(id) < 1) { + seen_nodes.insert(id); + size_t snarl_i = dist_index.getPrimaryAssignment(id); size_t depth = dist_index.snarl_indexes[snarl_i].depth; snarl_to_nodes_by_level[depth][snarl_i].emplace_back( - NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); + NetgraphNode(id, NODE), NodeClusters(tree_state.all_seeds->size())); } } + std::sort(tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end()); } } - void SnarlSeedClusterer::cluster_snarls(TreeState& tree_state, size_t depth) const { + void SnarlSeedClusterer::cluster_snarl_level(TreeState& tree_state, size_t depth) const { for (auto& kv : tree_state.snarl_to_nodes){ //Go through each of the snarls at this level, cluster them, @@ -192,14 +189,11 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //If this snarl is in a chain, cluster and add let the //tree state know which chain it belongs to - size_t chain_assignment = dist_index.getChainAssignment( - snarl_index.parent_id); - size_t chain_rank = dist_index.getChainRank( - snarl_index.id_in_parent); + size_t chain_assignment = dist_index.getChainAssignment(snarl_index.parent_id); + size_t chain_rank = dist_index.getChainRank(snarl_index.id_in_parent); tree_state.chain_to_snarls[chain_assignment].emplace( - chain_rank, make_pair(snarl_i, - cluster_one_snarl(tree_state, snarl_i))); + chain_rank, make_pair(snarl_i, cluster_one_snarl(tree_state, snarl_i))); #ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i << " headed by " @@ -243,7 +237,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } } - void SnarlSeedClusterer::cluster_chains(TreeState& tree_state, size_t depth) const { + void SnarlSeedClusterer::cluster_chain_level(TreeState& tree_state, size_t depth) const { for (auto& kv : tree_state.chain_to_snarls) { //For each chain at this level that has relevant child snarls in it, //find the clusters. @@ -274,7 +268,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { #endif // Map it to the snarl number that should be represented by it // (and thus also contain the chain) - size_t parent_snarl_i =dist_index.getPrimaryAssignment(parent_id); + size_t parent_snarl_i = dist_index.getPrimaryAssignment(parent_id); // Register clusters as relevant for that parent snarl. @@ -299,12 +293,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cerr << "Finding clusters on node " << node_id << " which has length " << node_length << endl; #endif - /*Find clusters of seeds in this node. 
- * Result contains hash_set of the union find group IDs of the new clusters, - * and the shortest distance from any seed to the left and right sides - * of the node*/ - //indices of union find group ids of clusters in this node + //Final clusters on the node that we will be returning NodeClusters node_clusters(tree_state.all_seeds->size()); if (tree_state.read_distance_limit > node_length) { @@ -314,14 +304,15 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { size_t fragment_group_id = -1; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), + tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end(), std::pair(node_id, 0)); - if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { + if (seed_range_start != tree_state.node_to_seeds[read_num].end() + && seed_range_start->first == node_id) { size_t group_id = seed_range_start->second; - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() + && iter->first == node_id; ++iter) { //For each seed on this node, add it to the cluster //And find the shortest distance from any seed to both //ends of the node @@ -402,19 +393,22 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { return node_clusters; } + + //The seeds may form multiple clusters on the node + //Sort the seeds by their offset in the node and split into clusters + + // for all seeds vector> seed_offsets; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), - std::pair(node_id, 0)); - if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(),tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() + && iter->first == node_id; ++iter) { //For each seed, find its offset pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; - int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) : get_offset(seed) + 1; node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); @@ -465,6 +459,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } else { //This becomes a new read cluster if (read_last_cluster[read_num] != -1) { + //Record the previous cluster node_clusters.read_cluster_heads.emplace(read_num, read_last_cluster[read_num]); } read_last_cluster[read_num] = std::get<1>(s); @@ -546,6 +541,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { TreeState& tree_state, size_t chain_index_i) const { /* * Find all the clusters in the given chain + * Iterate through snarls and create clusters of positions up to that snarl */ std::map>& snarls_in_chain = @@ -558,6 +554,64 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { << " headed by node " << chain_index.id_in_parent << endl; #endif + auto combine_chain_clusters = [&] (size_t& cluster_group, + vector& combined_group, size_t& fragment_combined_group, + vector& combined_left, vector& combined_right, + pair& dists, + vector>& to_erase, int64_t& fragment_dist,int64_t& read_dist, + size_t& read_num){ + //Compare and combine the given cluster_group with the read and fragment combined cluster + //Update the distances of the read combined cluster, if combined + //Returns true if the cluster got combined with a read cluster + + if (read_dist != -1 && read_dist <= tree_state.read_distance_limit){ + //If this chain cluster's rightmost seed is close enough + //to the leftmost seed of any cluster in the snarl, then + //this chain cluster is in the combined cluster + + if (combined_group[read_num] == -1) { + //New chain cluster + combined_group[read_num] = cluster_group; + combined_left[read_num] = dists.first; + combined_right[read_num] = dists.second; + } else { + //Combine + tree_state.read_union_find[read_num].union_groups(combined_group[read_num], cluster_group); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_group); + if (new_group == cluster_group) { + to_erase.emplace_back(read_num,combined_group[read_num]); + } else { + to_erase.emplace_back(read_num, cluster_group); + } + combined_group[read_num] = new_group; + combined_left[read_num] = min_not_minus_one(combined_left[read_num], dists.first); + combined_right[read_num] = min_not_minus_one(combined_right[read_num], dists.second); + } + cerr << "COMBINING READ: " ; + if (tree_state.fragment_distance_limit != 0) { + if (fragment_combined_group != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_group, + cluster_group+tree_state.read_index_offsets[read_num]); + } + fragment_combined_group = tree_state.fragment_union_find.find_group( + cluster_group+tree_state.read_index_offsets[read_num]); + cerr << " AND FRAGMENT" << endl; + } + return true; + } else if (fragment_dist != -1 && + fragment_dist <= tree_state.fragment_distance_limit) { + //If this is a new read cluster but the same fragment cluster + if (fragment_combined_group != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_group, cluster_group+tree_state.read_index_offsets[read_num]); + } + fragment_combined_group = tree_state.fragment_union_find.find_group(cluster_group+tree_state.read_index_offsets[read_num]); + + return false; + } + return false; + }; + + auto combine_snarl_clusters = [&] (size_t& new_group, size_t& combined_group, size_t& 
fragment_combined_group, vector>& to_erase, int64_t fragment_dist,int64_t read_dist, @@ -571,7 +625,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { combined_group = new_group; } else { //Union the two groups - combined_group = tree_state.read_union_find[read_num].find_group(combined_group); tree_state.read_union_find[read_num].union_groups(combined_group, new_group); //Find the new distances of the combined groups pair& old_dists = @@ -589,7 +642,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { dists = make_pair( min_not_minus_one(old_dists.first, dists.first), min_not_minus_one(old_dists.second, dists.second)); - tree_state.read_cluster_dists[read_num][new_group] = dists; tree_state.read_cluster_dists[read_num][combined_group] = dists; #ifdef DEBUG_CLUSTER cerr << " New dists for read num " << read_num << ": " @@ -598,16 +650,16 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { #endif } - if (tree_state.fragment_distance_limit != 0) { + if (tree_state.fragment_distance_limit != 0 && fragment_dist != -1) { if (fragment_combined_group != -1) { - //If we're keeping track of fragment clusters, union this + //If we're also keeping track of fragment clusters tree_state.fragment_union_find.union_groups(fragment_combined_group, new_group + tree_state.read_index_offsets[read_num]); } fragment_combined_group = tree_state.fragment_union_find.find_group( new_group + tree_state.read_index_offsets[read_num]); } - } else if (tree_state.fragment_distance_limit != 0 && + } else if (tree_state.fragment_distance_limit != 0 && fragment_dist != -1 && fragment_dist <= tree_state.fragment_distance_limit) { //If these aren't in the same read cluster but are in //the same fragment cluster @@ -745,8 +797,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { #endif - //Need to remember this to check if snarl clusters overlap the old - //best distance + //Remember the distances of the chain clusters, since we will be writing over them + //as we go int64_t fragment_chain_right = chain_clusters.fragment_best_right; vector read_chain_right = std::move(chain_clusters.read_best_right); @@ -768,19 +820,18 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { chain_clusters.read_best_right.assign(tree_state.all_seeds->size(), -1); for (pair cluster_head : snarl_clusters.read_cluster_heads) { // For each of the clusters for the current snarl, - // first check if it can be combined with any other - // snarl clusters by taking loops in the chain, + // first check if it can be combined with another cluster + // in the same snarl by taking loops in the chain, // then, find if it belongs to the new combined cluster // that includes chain clusters size_t read_num = cluster_head.first; pair snarl_dists = - std::move(tree_state.read_cluster_dists[read_num][cluster_head.second]); + std::move(tree_state.read_cluster_dists[read_num][cluster_head.second]); if (loop_dist_start != -1) { //If there is a loop going out and back into the start of - //the snarl, might combine this cluster with other snarl - //clusters + //the snarl, this cluster may be combined with other snarl clusters //The distance to the right side of the snarl // that is found by taking the leftmost seed and @@ -807,20 +858,21 @@ cerr << " (Possibly) updating looping distance to right of snarl cluster " << r cerr << " Combining this cluster from the left " ; #endif int64_t read_dist = snarl_clusters.read_best_left[read_num] == -1 ? 
-1 : - snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1; - combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], fragment_snarl_cluster_left, - to_erase, snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1, - read_dist, snarl_dists, read_num); + snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1; + int64_t fragment_dist = snarl_clusters.fragment_best_left == -1 ? -1 : + snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1; + + combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], + fragment_snarl_cluster_left, to_erase, fragment_dist, read_dist, snarl_dists, read_num); } } if (loop_dist_end != -1) { //If there is a loop to the right - int64_t new_left = snarl_dists.second == -1 || loop_dist_end == -1 - ? -1 + int64_t new_left = snarl_dists.second == -1 || loop_dist_end == -1 ? -1 : snarl_dists.second + loop_dist_end + snarl_length - end_length; - if (snarl_dists.first == -1 || (new_left != -1 & new_left < snarl_dists.first)){ + if (snarl_dists.first == -1 || (new_left != -1 && new_left < snarl_dists.first)){ //If this is an improvement, update distances snarl_dists.first = new_left; snarl_clusters.read_best_left[read_num] = @@ -833,73 +885,37 @@ cerr << "Updating looping distance to left of snarl cluster " << read_num << ":" #endif } - if (snarl_clusters.fragment_best_right != -1 && snarl_dists.second != -1 ) { + if (snarl_clusters.fragment_best_right != -1 && snarl_dists.second != -1) { //If this cluster can be combined with another cluster //from the right #ifdef DEBUG_CLUSTER -cerr << " Combining this cluster from the right" << endl; +cerr << " Maybe combining this cluster from the right" << endl; #endif int64_t read_dist = snarl_clusters.read_best_right[read_num] == -1 ? -1 : snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1; + int64_t fragment_dist = snarl_clusters.fragment_best_right == -1 ? -1 : + snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1; + combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], - fragment_snarl_cluster_right, to_erase, - snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1, + fragment_snarl_cluster_right, to_erase,fragment_dist, read_dist, snarl_dists, read_num); } } //Now check if this snarl cluster can be combined with any //existing chain clusters - if (read_chain_right[read_num] != -1 && snarl_dists.first != -1 && - snarl_dists.first + read_chain_right[read_num] - start_length-1 - <= tree_state.read_distance_limit) { - //If this snarl cluster's leftmost seed is close enough to - //the rightmost seed in the chain (up to this point), then - //this snarl cluster is in the combined cluster - - if (combined_cluster[read_num] == -1) { - combined_cluster[read_num] = cluster_head.second; - combined_left[read_num] = snarl_dists.first == -1 ? 
-1 : - snarl_dists.first + add_dist_left; - combined_right[read_num] = snarl_dists.second; - } else { - //Cluster - tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); - size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); - - if (new_group == cluster_head.second) { - to_erase.emplace_back(read_num,combined_cluster[read_num]); - } else { - to_erase.push_back(cluster_head); - } - - combined_cluster[read_num] = new_group; - combined_left[read_num] = min_not_minus_one(combined_left[read_num], - snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left); - combined_right[read_num] = min_not_minus_one(combined_right[read_num],snarl_dists.second); - } - if (tree_state.fragment_distance_limit != 0) { - if (fragment_combined_cluster != -1) { - //Also cluster by fragment - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, - cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } - } else { - //If the snarl cluster does not get combined with any of - //the existing chain clusters, then it becomes a new chain cluster - if (tree_state.fragment_distance_limit != 0 && fragment_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.fragment_distance_limit) { - //Cluster in the same fragment but not the same read - if (fragment_combined_cluster != -1) { - //Also cluster by fragment - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, - cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } + int64_t read_dist = read_chain_right[read_num] == -1 || snarl_dists.first == -1 ? -1 : + snarl_dists.first + read_chain_right[read_num] - start_length-1; + int64_t fragment_dist = tree_state.fragment_distance_limit == 0 || fragment_chain_right == -1 || snarl_dists.first == -1 + ? -1 : snarl_dists.first+fragment_chain_right-start_length-1; + pair new_snarl_dists (snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left, + snarl_dists.second); + bool combined_read = combine_chain_clusters (cluster_head.second,combined_cluster, fragment_combined_cluster, + combined_left, combined_right, new_snarl_dists, to_erase, fragment_dist, read_dist, read_num); + + if ( ! combined_read) { + //Create new chain cluster from snarl cluster to_add.push_back(cluster_head); //Update its distances to the correct nodes in the chain pair d = make_pair(snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left, @@ -919,54 +935,21 @@ cerr << " Combining this cluster from the right" << endl; //if they get combined with snarl clusters for (pair cluster_head : chain_clusters.read_cluster_heads) { //For each old chain cluster + + pair& chain_dists = tree_state.read_cluster_dists[cluster_head.first][cluster_head.second]; size_t read_num = cluster_head.first; - pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; - if (snarl_clusters.read_best_left[read_num] != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.read_best_left[read_num] - - start_length-1 <= tree_state.read_distance_limit){ - //If this chain cluster's rightmost seed is close enough - //to the leftmost seed of any cluster in the snarl, then - //this chain cluster is in the combined cluster + int64_t read_dist = snarl_clusters.read_best_left[read_num] == -1 || chain_dists.second == -1 ? -1 : + chain_dists.second + snarl_clusters.read_best_left[read_num] - start_length-1 ; + int64_t fragment_dist = tree_state.fragment_distance_limit == 0 || + snarl_clusters.fragment_best_left == -1 || chain_dists.second == -1 ? -1 : + chain_dists.second + snarl_clusters.fragment_best_left - start_length-1; + pair new_chain_dists (chain_dists.first, chain_dists.second + dist_to_end); + bool combined_read = combine_chain_clusters (cluster_head.second,combined_cluster, fragment_combined_cluster, + combined_left, combined_right, new_chain_dists, to_erase, fragment_dist, read_dist, cluster_head.first); - if (combined_cluster[read_num] == -1) { - //New chain cluster - combined_cluster[read_num] = cluster_head.second; - combined_left[read_num] = chain_dists.first; - combined_right[read_num] = chain_dists.second + dist_to_end; - } else { - //Combine - tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); - size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); - if (new_group == cluster_head.second) { - to_erase.emplace_back(read_num,combined_cluster[read_num]); - } else { - to_erase.push_back(cluster_head); - } - combined_cluster[read_num] = new_group; - combined_left[read_num] = min_not_minus_one(combined_left[read_num], chain_dists.first); - combined_right[read_num] = min_not_minus_one(combined_right[read_num], chain_dists.second + dist_to_end); - } - if (tree_state.fragment_distance_limit != 0) { - if (fragment_combined_cluster != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } - } else { - //If this chain cluster is on its own, extend its right - //distance to the end of the current snarl - if (tree_state.fragment_distance_limit != 0 && - snarl_clusters.fragment_best_left != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.fragment_best_left - - start_length-1 <= tree_state.fragment_distance_limit) { - //If this is a new read cluster but the same fragment cluster - if (fragment_combined_cluster != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } + if (!combined_read) { chain_dists.second += dist_to_end; if 
((tree_state.fragment_distance_limit == 0 && chain_dists.first - 2 >= tree_state.read_distance_limit && diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index 92c6b604f46..e35f877d4a7 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -225,10 +225,10 @@ class SnarlSeedClusterer { //Cluster all the snarls at the current level and update the tree_state //to add each of the snarls to the parent level - void cluster_snarls(TreeState& tree_state, size_t depth) const; + void cluster_snarl_level(TreeState& tree_state, size_t depth) const; //Cluster all the chains at the current level - void cluster_chains(TreeState& tree_state, size_t depth) const; + void cluster_chain_level(TreeState& tree_state, size_t depth) const; //Cluster the seeds on the specified node NodeClusters cluster_one_node(TreeState& tree_state, From 3bc83bc81805287d61dc9ec178b76eb4d13dd295 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 09:17:06 -0500 Subject: [PATCH 39/79] Revert "let poisson caller use min-ad and het-bias from old caller for now (hopefully temporary)" This reverts commit fd5b96731d4502817366baef5e3c5555c1063899. --- src/snarl_caller.cpp | 131 ++++++++++++++--------------------- src/snarl_caller.hpp | 38 +++++----- src/subcommand/call_main.cpp | 13 ++-- 3 files changed, 79 insertions(+), 103 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 475b6454666..d22171ba617 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -54,19 +54,6 @@ void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double m } } -void SupportBasedSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { - // want to move away from ugly hacks that treat the reference traversal differently, - // so keep all these set the same - if (het_bias >= 0) { - max_het_bias = het_bias; - max_ref_het_bias = het_bias; - max_indel_het_bias = het_bias; - } - if (ref_het_bias >= 0) { - max_ref_het_bias = ref_het_bias; - } -} - int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { int best_allele = -1; for(size_t i = 0; i < supports.size(); i++) { @@ -78,38 +65,6 @@ int SupportBasedSnarlCaller::get_best_support(const vector& supports, c return best_allele; } -double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, - int second_best_trav, int ref_trav_idx) const { - bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || - (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); - - double bias_limit = 1; - - if (best_trav >= 0 && second_best_trav >=0) { - if (best_trav == ref_trav_idx) { - // Use ref bias limit - - // We decide closeness differently depending on whether best is ref - // or not. In practice, we use this to slightly penalize homozygous - // ref calls (by setting max_ref_het_bias higher than max_het_bias) - // and rather make a less supported alt call instead. This boost - // max sensitivity, and because everything is homozygous ref by - // default in VCF, any downstream filters will effectively reset - // these calls back to homozygous ref. TODO: This shouldn't apply - // when off the primary path! 
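// The clustering refactor in PATCH 38 above folds the repeated merge-and-track
// logic into lambdas (combine_chain_clusters, combine_snarl_clusters) that
// capture the union-find structures by reference and report the surviving
// group head. A tiny self-contained sketch of that pattern (a plain
// parent-array union-find stands in for vg's structures::UnionFind):
#include <cstdio>
#include <numeric>
#include <vector>

struct UnionFind {
    std::vector<size_t> parent;
    explicit UnionFind(size_t n) : parent(n) { std::iota(parent.begin(), parent.end(), 0); }
    size_t find_group(size_t x) { return parent[x] == x ? x : parent[x] = find_group(parent[x]); }
    void union_groups(size_t a, size_t b) { parent[find_group(a)] = find_group(b); }
};

int main() {
    UnionFind uf(5);
    auto combine = [&](size_t a, size_t b) {  // captures uf by reference
        uf.union_groups(a, b);
        return uf.find_group(a);              // surviving cluster head
    };
    size_t head = combine(0, 3);
    head = combine(head, 4);
    std::printf("head %zu same %d\n", head, int(uf.find_group(0) == uf.find_group(4)));  // head 4 same 1
    return 0;
}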
- bias_limit = max_ref_het_bias; - } else if (is_indel) { - // This is an indel - // Use indel bias limit - bias_limit = max_indel_het_bias; - } else { - // Use normal het bias limit - bias_limit = max_het_bias; - } - } - return bias_limit; -} - RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder) : SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { @@ -119,6 +74,19 @@ RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { } +void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, @@ -450,6 +418,39 @@ function RatioSupportSnarlCaller::get_skip_allele_f }; } +double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const { + bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || + (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); + + double bias_limit = 1; + + if (best_trav >= 0 && second_best_trav >=0) { + if (best_trav == ref_trav_idx) { + // Use ref bias limit + + // We decide closeness differently depending on whether best is ref + // or not. In practice, we use this to slightly penalize homozygous + // ref calls (by setting max_ref_het_bias higher than max_het_bias) + // and rather make a less supported alt call instead. This boost + // max sensitivity, and because everything is homozygous ref by + // default in VCF, any downstream filters will effectively reset + // these calls back to homozygous ref. TODO: This shouldn't apply + // when off the primary path! 
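// A toy illustration (editor's sketch, heavily simplified from the real
// caller) of how the restored bias limit acts as a genotyping threshold in
// the ratio caller: a second allele is kept, giving a heterozygous call, as
// long as the best allele has at most bias_limit times its support.
#include <cstdio>

int main() {
    double bias_limit = 6.;                             // e.g. the default max_het_bias
    double best_support = 20., second_support = 4.;     // assumed read supports
    bool heterozygous = best_support <= bias_limit * second_support;
    std::printf("%s\n", heterozygous ? "0/1" : "1/1");  // 20 <= 24, so 0/1
    return 0;
}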
+ bias_limit = max_ref_het_bias; + } else if (is_indel) { + // This is an indel + // Use indel bias limit + bias_limit = max_indel_het_bias; + } else { + // Use normal het bias limit + bias_limit = max_het_bias; + } + } + return bias_limit; +} + + PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder, const algorithms::BinnedDepthIndex& depth_index) : @@ -562,7 +563,7 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double best_genotype_likelihood = -numeric_limits::max(); vector best_genotype; for (const auto& candidate : candidates) { - double gl = genotype_likelihood(candidate.first, candidate.second, traversals, traversal_sizes, ref_trav_idx, exp_depth, depth_err); + double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); if (gl > best_genotype_likelihood) { best_genotype_likelihood = gl; best_genotype = candidate.first; @@ -578,7 +579,6 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, const vector& genotype_supports, const vector& traversals, - const vector& traversal_sizes, int ref_trav_idx, double exp_depth, double depth_err) { assert(genotype_supports.size() == genotype.size()); @@ -602,24 +602,6 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); } } - - // we preserve our het-bias from RatioSupportSnarlCaller as something prior-like here - // todo: use something better - double het_prior = 0.; - double ew_prior = 0.; - if (genotype.size() == 2) { - double het_bias = get_bias(traversal_sizes, genotype[0], genotype[1], ref_trav_idx); - if (genotype[0] != genotype[1]) { - // if the het_bias is greater than 1 (usually it's 6 by default), then - // we get a prior of 5/6 for a het - het_prior = log(1. - 1. / het_bias); - ew_prior = ewens_af_prob_ln({2, 0}, 0.001); - } else { - // and 1/6 for a hom - het_prior = log(1. 
/ het_bias); - ew_prior = ewens_af_prob_ln({0, 1}, 0.001); - } - } // total support of the site Support total_site_support = total_other_support; @@ -661,12 +643,11 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } #ifdef debug - cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood << " prior " << het_prior - << " ew prior " << ew_prior - << " total-prob " << (alleles_log_likelihood + other_log_likelihood + het_prior) << endl; + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood + << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; #endif - return alleles_log_likelihood + other_log_likelihood + het_prior; + return alleles_log_likelihood + other_log_likelihood; } void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, @@ -687,8 +668,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } double total_site_depth = support_val(site_support); - vector traversal_sizes = support_finder.get_traversal_sizes(traversals); - // Set the variant's total depth string depth_string = std::to_string((int64_t)round(total_site_depth)); variant.format.push_back("DP"); @@ -700,7 +679,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); set called_allele_set(genotype.begin(), genotype.end()); - double min_site_support = genotype.size() > 0 ? INFINITY : 0; for (int i = 0; i < traversals.size(); ++i) { vector shared_travs; @@ -720,10 +698,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // there is certainly room for optimization via remembering some of this stuff here vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); - if (in_genotype) { - // update the minimum support - min_site_support = min(min_site_support, total(allele_supports[i])); - } } // get the genotype likelihoods @@ -757,7 +731,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); genotype_supports.push_back(gt_supports[j]); } - double gl = genotype_likelihood({i, j}, genotype_supports, traversals, traversal_sizes, 0, exp_depth, depth_err); + double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); @@ -769,9 +743,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } - // use old quality for now - variant.quality = min_site_support; - // todo /* // Now do the filters diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index e2d7aa78f5c..1c3deace343 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -75,8 +75,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Set some of the parameters void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); - - void set_het_bias(double het_bias, double ref_het_bias = 0.); /// Get the traversal support finder const TraversalSupportFinder& get_support_finder() const; @@ -92,12 +90,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Relic from old code static double 
support_val(const Support& support) { return total(support); };
-
-    /// Get the bias used to for comparing two traversals
-    /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel
-    /// see tuning parameters below)
-    double get_bias(const vector<int>& traversal_sizes, int best_trav,
-                    int second_best_trav, int ref_trav_idx) const;
-
     const PathHandleGraph& graph;
 
     SnarlManager& snarl_manager;
@@ -115,15 +107,6 @@ class SupportBasedSnarlCaller : public SnarlCaller {
     /// what's the minimum total support (over all alleles) of the site to make
     /// a call
     size_t min_site_depth = 3;
-    /// What fraction of the reads supporting an alt are we willing to discount?
-    /// At 2, if twice the reads support one allele as the other, we'll call
-    /// homozygous instead of heterozygous. At infinity, every call will be
-    /// heterozygous if even one read supports each allele.
-    double max_het_bias = 6;
-    /// Like above, but applied to ref / alt ratio (instead of alt / ref)
-    double max_ref_het_bias = 6;
-    /// Like the max het bias, but applies to novel indels.
-    double max_indel_het_bias = 6;
 };
 
@@ -137,6 +120,9 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller {
                             TraversalSupportFinder& support_finder);
     virtual ~RatioSupportSnarlCaller();
 
+    /// Set some of the parameters
+    void set_het_bias(double het_bias, double ref_het_bias = 0.);
+
     /// Get the genotype of a site
     virtual vector<int> genotype(const Snarl& snarl,
                                  const vector<SnarlTraversal>& traversals,
@@ -160,12 +146,27 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller {
 
 protected:
 
+    /// Get the bias used for comparing two traversals
+    /// (It differs heuristically depending on whether they are alt/ref/het/hom/snp/indel;
+    /// see tuning parameters below)
+    double get_bias(const vector<int>& traversal_sizes, int best_trav,
+                    int second_best_trav, int ref_trav_idx) const;
+
     /// get a map of the beginning of a node (in forward orientation) on a traversal
     /// used for up-weighting large deletion edges in complex snarls with average support
     unordered_map<id_t, size_t> get_ref_offsets(const SnarlTraversal& ref_trav) const;
 
     /// Tuning
 
+    /// What fraction of the reads supporting an alt are we willing to discount?
+    /// At 2, if twice the reads support one allele as the other, we'll call
+    /// homozygous instead of heterozygous. At infinity, every call will be
+    /// heterozygous if even one read supports each allele.
+    double max_het_bias = 6;
+    /// Like above, but applied to ref / alt ratio (instead of alt / ref)
+    double max_ref_het_bias = 6;
+    /// Like the max het bias, but applies to novel indels.
+    double max_indel_het_bias = 6;
     /// Used for calling 1/2 calls. If both alts (times this bias) are greater than
     /// the reference, the call is made. Set to 0 to deactivate.
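    /// (Illustrative reading of the rule above, not an extra documented setting:
    /// with max_ma_bias = 1, a 1/2 call is only made when each alt allele
    /// out-supports the reference.)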
    double max_ma_bias = 0;
@@ -217,14 +218,13 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     double genotype_likelihood(const vector<int>& genotype,
                                const vector<Support>& genotype_supports,
                                const vector<SnarlTraversal>& traversals,
-                               const vector<int>& traversal_sizes,
                                int ref_trav_idx, double exp_depth, double depth_err);
 
     /// Rank supports
     vector<int> rank_by_support(const vector<Support>& supports);
 
     /// Baseline mapping error rate (gets added to the standard error from coverage)
-    double baseline_mapping_error = 0.005;
+    double baseline_mapping_error = 0.05;
     /// Consider up to the top-k traversals (based on support) for genotyping
     size_t top_k = 25;
 
diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp
index 4a611c67303..3f69185b0c9 100644
--- a/src/subcommand/call_main.cpp
+++ b/src/subcommand/call_main.cpp
@@ -224,6 +224,11 @@ int main_call(int argc, char** argv) {
             cerr << "error [vg call]: when using -l, the same number of paths must be given with -p" << endl;
             return 1;
         }
+        // Check bias option
+        if (!bias_string.empty() && !ratio_caller) {
+            cerr << "error [vg call]: -b can only be used with -B" << endl;
+            return 1;
+        }
 
         // No paths specified: use them all
         if (ref_paths.empty()) {
@@ -268,21 +273,21 @@ int main_call(int argc, char** argv) {
 
         if (ratio_caller == false) {
             // Make a depth index
-            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 50000, 0, true, true);
+            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true);
             // Make a new-style probabilistic caller
             auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index);
             packed_caller = poisson_caller;
         } else {
             // Make an old-style ratio support caller
             auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder);
+            if (het_bias >= 0) {
+                ratio_caller->set_het_bias(het_bias, ref_het_bias);
+            }
             packed_caller = ratio_caller;
         }
         if (min_allele_support >= 0) {
             packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support);
         }
-        if (het_bias >= 0) {
-            packed_caller->set_het_bias(het_bias, ref_het_bias);
-        }
         snarl_caller = unique_ptr<SnarlCaller>(packed_caller);
     }

From 4713e296b3e189ee6a147dbd0a32744fc6be9954 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 12 Nov 2019 09:28:30 -0500
Subject: [PATCH 40/79] fix bug in support splitting and other tweaks

---
 src/snarl_caller.cpp | 21 +++++++++++----------
 src/snarl_caller.hpp |  2 +-
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index d22171ba617..f96afd7c2f1 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -596,8 +596,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector<int>& genotyp
     // split the homozygous support into two
     // from now on we'll treat it like two separate observations, each with half coverage
     vector<Support> fixed_genotype_supports = genotype_supports;
-    if (std::equal(genotype_supports.begin() + 1, genotype_supports.end(), genotype_supports.begin(),
-                   [&](const Support& s1, const Support& s2) { return support_val(s1) == support_val(s2); })) {
+    if (std::equal(genotype.begin() + 1, genotype.end(), genotype.begin())) {
         for (int i = 0; i < genotype_supports.size(); ++i) {
             fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size();
         }
@@ -679,6 +678,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     // get the allele depths
     variant.format.push_back("AD");
     set<int> called_allele_set(genotype.begin(),
genotype.end()); + double min_site_support = genotype.size() > 0 ? INFINITY : 0; for (int i = 0; i < traversals.size(); ++i) { vector shared_travs; @@ -698,6 +698,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // there is certainly room for optimization via remembering some of this stuff here vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); + if (in_genotype) { + // update the minimum support + min_site_support = min(min_site_support, total(allele_supports[i])); + } } // get the genotype likelihoods @@ -743,22 +747,19 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } - // todo - /* + // use old quality for now + variant.quality = min_site_support; + // Now do the filters + // todo: fix and share with other caller variant.filter = "PASS"; if (min_site_support < min_mad_for_filter) { // Apply Min Allele Depth cutoff across all alleles (even ref) variant.filter = "lowad"; - } else if (min_ad_log_likelihood_for_filter != 0 && - ad_log_likelihood < min_ad_log_likelihood_for_filter) { - // We have a het, but the assignment of reads between the two branches is just too weird - variant.filter = "lowxadl"; - } else if ((int64_t)round(total(total_support)) < min_site_depth) { + } else if ((int64_t)round(total_site_depth) < min_site_depth) { // we don't have enough support to want to make a call variant.filter = "lowdepth"; } - */ } void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 1c3deace343..e9e92571c09 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -224,7 +224,7 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { vector rank_by_support(const vector& supports); /// Baseline mapping error rate (gets added to the standard error from coverage) - double baseline_mapping_error = 0.05; + double baseline_mapping_error = 0.005; /// Consider up to the top-k traversals (based on support) for genotyping size_t top_k = 25; From 42aba1d7b130cbee588e1b94ab7c54b65a6b9ef5 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 11:40:33 -0500 Subject: [PATCH 41/79] clean up some traversal support computation --- src/snarl_caller.cpp | 152 +++++++++++++------------------------- src/snarl_caller.hpp | 1 - src/traversal_support.cpp | 65 ++++++++++++---- src/traversal_support.hpp | 32 +++++--- 4 files changed, 126 insertions(+), 124 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index f96afd7c2f1..b8ae480e2f6 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -102,7 +102,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, false, false, false, ref_trav_idx); int best_allele = get_best_support(supports, {}); #ifdef debug @@ -117,7 +117,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = 
support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, true, false, false, ref_trav_idx); vector skips = {best_allele}; for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); @@ -130,7 +130,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, } } // get the supports of each traversal in light of best - vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, false, false, false, ref_trav_idx); int second_best_allele = get_best_support(secondary_supports, {skips}); // get the supports of each traversal in light of second best @@ -139,7 +139,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, int third_best_allele = -1; if (second_best_allele != -1) { // prune out traversals whose exclusive support relative to second best doesn't pass cut - vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, true, false, false, ref_trav_idx); + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, {}, true, false, false, ref_trav_idx); skips.push_back(best_allele); skips.push_back(second_best_allele); for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { @@ -148,7 +148,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, skips.push_back(i); } } - tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, false, ref_trav_idx); + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, {}, false, false, false, ref_trav_idx); third_best_allele = get_best_support(tertiary_supports, skips); } @@ -298,10 +298,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } // compute the support of our called alleles // todo: I think this undercounts support. shuold be fixed (as in Poisson version) - vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, false, 0); - - // get the support of our uncalled alleles, making sure to not include any called support - vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, 0); + vector allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); // Set up the depth format field variant.format.push_back("DP"); @@ -314,7 +311,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, variant.format.push_back("XAAD"); // Compute the total support for all the alts that will be appearing - Support total_support; + Support total_support = support_finder.get_total_traversal_set_support(traversals, 0); // And total alt allele depth for the alt alleles Support alt_support; // Find the min total support of anything called @@ -323,14 +320,11 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, if (!allele_supports.empty()) { //only add info if we made a call for (int allele = 0; allele < traversals.size(); ++allele) { bool is_called = called_allele_set.count(allele); - auto& support = is_called ? 
allele_supports[allele] : uncalled_supports[allele]; + auto& support = allele_supports[allele]; // Set up allele-specific stats for the allele variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(total(support)))); - // Sum up into total depth - total_support += support; - if (allele != 0) { // It's not the primary reference allele alt_support += support; @@ -482,14 +476,14 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, false, false, false, ref_trav_idx); // sort the traversals by support vector ranked_traversals = rank_by_support(supports); size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size()); // the candidate genotypes and their supports. the numbers here are alleles as indexed in traversals[] - map, vector> candidates; + set> candidates; // pre-filter out some alleles based on poor exclusive support set skips; @@ -504,13 +498,13 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } if (ploidy == 1) { - candidates[{best_allele}] = {supports[best_allele]}; + candidates.insert({best_allele}); } else { assert(ploidy == 2); // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, true, false, false, ref_trav_idx); for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { if (j != best_allele && support_val(secondary_exclusive_supports[j]) < min_total_support_for_call && @@ -520,32 +514,21 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } // get the supports of each traversal in light of best - vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, false, false, false, ref_trav_idx); vector ranked_secondary_traversals = rank_by_support(secondary_supports); // add the homozygous genotype for our best allele - candidates[{best_allele, best_allele}] = {supports[best_allele], supports[best_allele]}; + candidates.insert({best_allele, best_allele}); // now look at the top-k second-best traversals size_t sec_count = 0; for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { int second_best_allele = ranked_secondary_traversals[j]; if (!skips.count(second_best_allele) && second_best_allele != best_allele) { - // second best allele's support, sharing nodes with best - Support& second_best_support = secondary_supports[second_best_allele]; - // best allele's support, sharing nodes with second best - Support best_support_het = support_finder.get_traversal_set_support( - {traversals[best_allele], traversals[second_best_allele]}, - {1}, false, false, false, ref_trav_idx)[0]; - // canonical ordering for our set - if (best_allele < second_best_allele) { - candidates[{best_allele, second_best_allele}] = {best_support_het, second_best_support}; - } else { - 
candidates[{second_best_allele, best_allele}] = {second_best_support, best_support_het}; - } + candidates.insert({min(best_allele, second_best_allele), max(best_allele, second_best_allele)}); // also make sure we have our homozygous genotype for the second best allele - candidates[{second_best_allele, second_best_allele}] = {supports[second_best_allele], supports[second_best_allele]}; + candidates.insert({second_best_allele, second_best_allele}); ++sec_count; } } @@ -563,10 +546,10 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double best_genotype_likelihood = -numeric_limits::max(); vector best_genotype; for (const auto& candidate : candidates) { - double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); + double gl = genotype_likelihood(candidate, traversals, ref_trav_idx, exp_depth, depth_err); if (gl > best_genotype_likelihood) { best_genotype_likelihood = gl; - best_genotype = candidate.first; + best_genotype = candidate; } } @@ -577,20 +560,26 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, - const vector& genotype_supports, const vector& traversals, int ref_trav_idx, double exp_depth, double depth_err) { - assert(genotype_supports.size() == genotype.size()); assert(genotype.size() == 1 || genotype.size() == 2); + // get the total support over the site + // todo: bump this to calling method to not recompute for each genotype!!! + Support total_site_support = support_finder.get_total_traversal_set_support(traversals, ref_trav_idx); + + // get the genotype support + // todo : we aren't using the non-genotype allele supports in this method, add flag to not compute them here! + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, ref_trav_idx); - // we need the support of all traversals *not* in the genotype. - Support total_other_support; - // we are running in a mode that will ignore stuff in our genotype, and only count the remainders once. - vector other_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, ref_trav_idx); - for (auto& other_support : other_supports) { - total_other_support += other_support; + // get the total support of traversals *not* in the genotype + // note that if we sum it up from allele_supports, it will likely be underestimated when using min (instead of avg supports) + // so we subtract it out of the total instead + Support total_other_support = total_site_support; + set genotype_set(genotype.begin(), genotype.end()); + for (int allele : genotype_set) { + total_other_support += -1. 
* genotype_supports[allele]; } // split the homozygous support into two @@ -598,16 +587,10 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp vector fixed_genotype_supports = genotype_supports; if (std::equal(genotype.begin() + 1, genotype.end(), genotype.begin())) { for (int i = 0; i < genotype_supports.size(); ++i) { - fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); + fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype.size(); } } - // total support of the site - Support total_site_support = total_other_support; - for (auto& support : fixed_genotype_supports) { - total_site_support += support; - } - // how many reads would we expect to not map to our genotype due to error double error_rate = std::min(0.25, depth_err + baseline_mapping_error); double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support); @@ -631,12 +614,13 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp // now we compute the likelihood of our genotype double alleles_log_likelihood = 0; - for (int i = 0; i < fixed_genotype_supports.size(); ++i) { - double allele_ll = poisson_prob_ln(std::round(support_val(fixed_genotype_supports[i])), allele_poisson_lambda); + for (int allele : genotype) { + const Support& allele_support = fixed_genotype_supports[allele]; + double allele_ll = poisson_prob_ln(std::round(support_val(allele_support)), allele_poisson_lambda); alleles_log_likelihood += allele_ll; #ifdef debug - cerr << " a[" << i <<"]=" << genotype[i] << " sup=" << genotype_supports[i] << " fix-sup=" << fixed_genotype_supports[i] + cerr << " a[" << allele <<"]=" << " sup=" << genotype_supports[allele] << " fix-sup=" << allele_support << " prob " << allele_ll << endl; #endif } @@ -658,14 +642,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, assert(traversals.size() == variant.alleles.size()); // Get the depth of the site - - // get the unique supports (useful only for getting a total) - vector unique_supports = support_finder.get_traversal_set_support(traversals, {}, false, true, true, 0); - Support site_support; - for (const Support& sup : unique_supports) { - site_support += sup; - } - double total_site_depth = support_val(site_support); + // todo: pass this down to genotype_likelihood + + Support total_site_support = support_finder.get_total_traversal_set_support(traversals, 0); + double total_site_depth = support_val(total_site_support); // Set the variant's total depth string depth_string = std::to_string((int64_t)round(total_site_depth)); @@ -677,30 +657,19 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); - set called_allele_set(genotype.begin(), genotype.end()); + + // get the genotype support + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); + set genotype_set(genotype.begin(), genotype.end()); double min_site_support = genotype.size() > 0 ? INFINITY : 0; + // update the allele depths for (int i = 0; i < traversals.size(); ++i) { - vector shared_travs; - bool in_genotype = called_allele_set.count(i); - if (in_genotype) { - // if we're in the genotype, then we share support with other alleles. 
- for (int a : called_allele_set) { - if (a != i) { - shared_travs.push_back(a); - } - } - } else { - // if we're not in the genotype, then we ignore support of everything in the genotype - shared_travs = genotype; - } - // we recompute all supports for each allele to get it's support relative to the genotype - // there is certainly room for optimization via remembering some of this stuff here - vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); - variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); - if (in_genotype) { + Support allele_support = genotype_supports[i]; + variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_support)))); + if (genotype_set.count(i)) { // update the minimum support - min_site_support = min(min_site_support, total(allele_supports[i])); + min_site_support = min(min_site_support, total(genotype_supports[i])); } } @@ -722,28 +691,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // assume ploidy 2 for (int i = 0; i < traversals.size(); ++i) { for (int j = i; j < traversals.size(); ++j) { - vector genotype_supports; - if (i == j) { - // put the full support of allele for each copy of homozygous (genotype method expects this) - vector gt_supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false); - genotype_supports = {gt_supports[i], gt_supports[i]}; - } else { - // compute each support relative to the other - // todo: we can speed this up by saving above, or filtering down traversal list to just our genotype alleles - vector gt_supports = support_finder.get_traversal_set_support(traversals, {j}, false, false, false); - genotype_supports.push_back(gt_supports[i]); - gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); - genotype_supports.push_back(gt_supports[j]); - } - double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); + double gl = genotype_likelihood({i, j}, traversals, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); - - // use our likelihood as the VCF quality - // todo: check if there's something more conventional to use - if ((genotype[0] == i && genotype[1] == j) || (genotype[0] == j && genotype[1] == i)) { - variant.quality = logprob_to_phred(gl); - } } } diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index e9e92571c09..62f18563f56 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -216,7 +216,6 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { /// Homozygous alleles are split into two, with half support each /// The (natural) logoarithm is returned double genotype_likelihood(const vector& genotype, - const vector& genotype_supports, const vector& traversals, int ref_trav_idx, double exp_depth, double depth_err); diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index e3fed43860a..a3ce23539a5 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -60,16 +60,55 @@ tuple TraversalSupportFinder::get_child_support(const Sna Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const { - return get_traversal_set_support({traversal}, {}, false, false, false).at(0); + return get_traversal_set_support({traversal}, {}, {}, false, false, 
false).at(0); +} + +Support TraversalSupportFinder::get_total_traversal_set_support(const vector& traversals, + int ref_trav_idx) const { + // share everything + vector shared_travs(traversals.size()); + for (int i = 0; i < shared_travs.size(); ++i) { + shared_travs[i] = i; + } + + // get the support of everything, where all shared nodes and edges are scaled by the number of times they're shared + vector supports = get_traversal_set_support(traversals, shared_travs, {}, false, false, true, ref_trav_idx); + + // sum it up + Support total; + for (const Support& support : supports) { + total += support; + } + + return total; +} + +vector TraversalSupportFinder::get_traversal_genotype_support(const vector& traversals, + const vector& genotype, + int ref_trav_idx) { + set tgt_trav_set(genotype.begin(), genotype.end()); + vector tgt_travs(tgt_trav_set.begin(), tgt_trav_set.end()); + // get the support of just the alleles in the genotype, evenly splitting shared stuff + vector allele_support = get_traversal_set_support(traversals, tgt_travs, tgt_trav_set, false, false, true, ref_trav_idx); + // get the support of everythin else, treating stuff in the genotype alleles as 0 + vector other_support = get_traversal_set_support(traversals, tgt_travs, {}, false, true, false, ref_trav_idx); + // combine the above two vectors + for (int allele : tgt_travs) { + other_support[allele] = allele_support[allele]; + } + return other_support; } vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, const vector& shared_travs, + const set& tgt_travs, bool exclusive_only, bool exclusive_count, - bool unique, + bool mutual_shared, int ref_trav_idx) const { - assert(!unique || (exclusive_count || exclusive_only)); + + // mutual_shared only makes sense when everything is shared + assert(!mutual_shared || shared_travs.size() == traversals.size() || shared_travs.size() == tgt_travs.size()); // pass 1: how many times have we seen a node or edge unordered_map node_counts; @@ -139,7 +178,8 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector 0) ? 0. : 1. / (1. + share_count); + double denom_add = mutual_shared ? 0 : 1; + double scale_factor = ((exclusive_only || exclusive_count) && share_count > 0) ? 0. : 1. 
/ (denom_add + share_count); // when looking at exclusive support, we don't normalize by skipped lengths if (scale_factor != 0 || !exclusive_only || exclusive_count) { @@ -156,6 +196,10 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector TraversalSupportFinder::get_traversal_set_support(const vector 0 && visit_idx < trav.visit_size() - 1)) { update_support(trav_idx, min_support, avg_support, length, share_count); @@ -195,15 +235,14 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, + int ref_trav_idx = -1) const; + + /// wrapper for using get_traversal_set_support to get the support for + /// some alleles in a genotype, where everything is split evently among them + /// anything not in the genotype gets a support using "exclusive_count" + /// where nodes taken by the genotype are counted as 0 + virtual vector get_traversal_genotype_support(const vector& traversals, + const vector& genotype, + int ref_trav_idx = -1); + + /// traversals: get support for each traversal in this set + /// shared_travs: if a node appears N times in shared_travs, then it will count as 1 / (N+1) support + /// tgt_travs: if not empty, only compute support for these traversals (remaining slots in output vector left 0) + /// eclusive_only: shared_travs are completely ignored + /// exclusive_count: anything in shared_travs treated as 0 + /// mutual_shared: shared_travs count as 1/N support (instead of 1/(N+1)). usefuly for total support + /// ref_trav_idx: index of reference traversal if known virtual vector get_traversal_set_support(const vector& traversals, const vector& shared_travs, + const set& tgt_travs, bool exclusive_only, bool exclusive_count, - bool unique, + bool mutual_shared, int ref_trav_idx = -1) const; /// Get the total length of all nodes in the traversal From 278ec93d3443621fe343e9daa884b548e67994d0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 15:43:09 -0500 Subject: [PATCH 42/79] float depths. 
deprecated total support function that doesnt work --- src/algorithms/coverage_depth.cpp | 26 +++++++++++++--- src/algorithms/coverage_depth.hpp | 6 ++-- src/snarl_caller.cpp | 51 ++++++++++++++----------------- src/traversal_support.cpp | 20 ------------ src/traversal_support.hpp | 8 +---- 5 files changed, 48 insertions(+), 63 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index 6d28335f4d7..7429a3b6e0d 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -130,18 +130,18 @@ vector> binned_packed_depth(const Packer& return binned_depths; } -unordered_map>> binned_packed_depth_index(const Packer& packer, +unordered_map>> binned_packed_depth_index(const Packer& packer, const vector& path_names, size_t bin_size, size_t min_coverage, bool include_deletions, bool std_err) { - unordered_map>> depth_index; + unordered_map>> depth_index; for (const string& path_name : path_names) { vector> binned_depths = binned_packed_depth(packer, path_name, bin_size, min_coverage, include_deletions); // todo: probably more efficent to just leave in sorted vector - map>& depth_map = depth_index[path_name]; + map>& depth_map = depth_index[path_name]; for (auto& binned_depth : binned_depths) { double var = get<3>(binned_depth); // optionally convert variance to standard error @@ -155,14 +155,30 @@ unordered_map>> binned_packed_depth_ind return depth_index; } -const pair& get_depth_from_index(const unordered_map>>& depth_index, - const string& path_name, size_t offset) { +const pair& get_depth_from_index(const unordered_map>>& depth_index, + const string& path_name, size_t offset) { auto ub = depth_index.at(path_name).upper_bound(offset); --ub; return ub->second; } +pair get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t start_offset, size_t end_offset) { + auto ub = depth_index.at(path_name).upper_bound(start_offset); + --ub; + auto ub_end = depth_index.at(path_name).upper_bound(end_offset); + size_t count = 0; + pair total = make_pair(0, 0); + for (auto cur = ub; cur != ub_end; ++cur, ++count) { + total.first += cur->second.first; + total.second += cur->second.second; + } + // todo: better way of combining? + total.first /= (double)count; + total.second /= (double)count; + return total; +} + // draw (roughly) max_nodes nodes from the graph using the random seed static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { default_random_engine generator(random_seed); diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 9084949df36..c6e7ed3797f 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -38,7 +38,7 @@ vector> binned_packed_depth(const Packer& /// Use the above function to retrieve the binned depths of a list of paths, and store them indexed by start /// coordinate. If std_err is true, store instead of -using BinnedDepthIndex = unordered_map>>; +using BinnedDepthIndex = unordered_map>>; BinnedDepthIndex binned_packed_depth_index(const Packer& packer, const vector& path_names, size_t bin_size, @@ -47,8 +47,8 @@ BinnedDepthIndex binned_packed_depth_index(const Packer& packer, bool std_err); /// Query index created above -/// Todo: optionally smooth over adjacent bins? 
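+/// (The range overload below averages the mean and the error of every bin
+/// overlapping [start_offset, end_offset], per the implementation above.)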
-const pair& get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t offset); +const pair& get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t offset); +pair get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t start_offset, size_t end_offset); /// Return the mean and variance of coverage of randomly sampled nodes from a GAM /// Nodes with less than min_coverage are ignored diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index b8ae480e2f6..b9f9356d715 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -297,8 +297,10 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, shared_travs.push_back(genotype[0]); } // compute the support of our called alleles - // todo: I think this undercounts support. shuold be fixed (as in Poisson version) vector allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); + + // Compute the total support for all the alts that will be appearing + Support total_support = std::accumulate(allele_supports.begin(), allele_supports.end(), Support()); // Set up the depth format field variant.format.push_back("DP"); @@ -309,9 +311,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, variant.format.push_back("XADL"); // Also the alt allele depth variant.format.push_back("XAAD"); - - // Compute the total support for all the alts that will be appearing - Support total_support = support_finder.get_total_traversal_set_support(traversals, 0); + // And total alt allele depth for the alt alleles Support alt_support; // Find the min total support of anything called @@ -536,10 +536,9 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } // expected depth from our coverage - const pair& start_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first); - const pair& end_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.second); - double exp_depth = (start_depth.first + end_depth.first) / 2.; - double depth_err = (start_depth.second + end_depth.second) / 2.; + auto depth_info = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first, ref_range.second); + double exp_depth = depth_info.first; + double depth_err = depth_info.second; assert(!isnan(exp_depth) && !isnan(depth_err)); // genotype (log) likelihoods @@ -565,23 +564,21 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp assert(genotype.size() == 1 || genotype.size() == 2); - // get the total support over the site - // todo: bump this to calling method to not recompute for each genotype!!! - Support total_site_support = support_finder.get_total_traversal_set_support(traversals, ref_trav_idx); - // get the genotype support - // todo : we aren't using the non-genotype allele supports in this method, add flag to not compute them here! 
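+    // (get_traversal_genotype_support splits support shared between the
+    // genotype alleles evenly among them, and counts the genotype's nodes
+    // as zero when scoring the remaining alleles)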
vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, ref_trav_idx); + // get the total support over the site + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); + // get the total support of traversals *not* in the genotype - // note that if we sum it up from allele_supports, it will likely be underestimated when using min (instead of avg supports) - // so we subtract it out of the total instead - Support total_other_support = total_site_support; + Support total_other_support; set genotype_set(genotype.begin(), genotype.end()); - for (int allele : genotype_set) { - total_other_support += -1. * genotype_supports[allele]; + for (int i = 0; i < traversals.size(); ++i) { + if (!genotype_set.count(i)) { + total_other_support += genotype_supports[i]; + } } - + // split the homozygous support into two // from now on we'll treat it like two separate observations, each with half coverage vector fixed_genotype_supports = genotype_supports; @@ -641,10 +638,11 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, assert(traversals.size() == variant.alleles.size()); - // Get the depth of the site - // todo: pass this down to genotype_likelihood + // get the genotype support + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); - Support total_site_support = support_finder.get_total_traversal_set_support(traversals, 0); + // Get the depth of the site + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); double total_site_depth = support_val(total_site_support); // Set the variant's total depth @@ -658,8 +656,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); - // get the genotype support - vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); set genotype_set(genotype.begin(), genotype.end()); double min_site_support = genotype.size() > 0 ? 
INFINITY : 0; @@ -682,10 +678,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // expected depth from our coverage pair ref_range = make_pair(variant.position, variant.position + variant.ref.length()); - const pair& start_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first); - const pair& end_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.second); - double exp_depth = (start_depth.first + end_depth.first) / 2.; - double depth_err = (start_depth.second + end_depth.second) / 2.; + auto depth_info = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first, ref_range.second); + double exp_depth = depth_info.first; + double depth_err = depth_info.second; assert(!isnan(exp_depth) && !isnan(depth_err)); // assume ploidy 2 diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index a3ce23539a5..2bcb478b7c6 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -63,26 +63,6 @@ Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& trav return get_traversal_set_support({traversal}, {}, {}, false, false, false).at(0); } -Support TraversalSupportFinder::get_total_traversal_set_support(const vector& traversals, - int ref_trav_idx) const { - // share everything - vector shared_travs(traversals.size()); - for (int i = 0; i < shared_travs.size(); ++i) { - shared_travs[i] = i; - } - - // get the support of everything, where all shared nodes and edges are scaled by the number of times they're shared - vector supports = get_traversal_set_support(traversals, shared_travs, {}, false, false, true, ref_trav_idx); - - // sum it up - Support total; - for (const Support& support : supports) { - total += support; - } - - return total; -} - vector TraversalSupportFinder::get_traversal_genotype_support(const vector& traversals, const vector& genotype, int ref_trav_idx) { diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp index ff2051cbb5c..42fd10607bb 100644 --- a/src/traversal_support.hpp +++ b/src/traversal_support.hpp @@ -45,13 +45,7 @@ class TraversalSupportFinder { /// Get the support of a traversal /// Child snarls are handled as in the old call code: their maximum support is used virtual Support get_traversal_support(const SnarlTraversal& traversal) const; - - /// wrapper for using get_traversal_set_support to get the total support - /// (sets shared_travs to the whole set, mutual_shared to true, then - /// sums over the results) - virtual Support get_total_traversal_set_support(const vector& traversals, - int ref_trav_idx = -1) const; - + /// wrapper for using get_traversal_set_support to get the support for /// some alleles in a genotype, where everything is split evently among them /// anything not in the genotype gets a support using "exclusive_count" From aad52e4e2c98b39054e1b67a54d9a3ad98d7513d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 15:55:33 -0500 Subject: [PATCH 43/79] turn bin size way down --- src/snarl_caller.cpp | 2 +- src/subcommand/call_main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index b9f9356d715..6556bbb7bb4 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -589,7 +589,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } // how many reads would we expect to not map to our genotype due to error - double error_rate = std::min(0.25, depth_err + 
baseline_mapping_error);
+    double error_rate = std::min(0.05, depth_err + baseline_mapping_error);
     double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support);
 
     // and our likelihood for the unmapped reads we see:
diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp
index 3f69185b0c9..baae2060516 100644
--- a/src/subcommand/call_main.cpp
+++ b/src/subcommand/call_main.cpp
@@ -273,7 +273,7 @@ int main_call(int argc, char** argv) {
 
         if (ratio_caller == false) {
             // Make a depth index
-            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true);
+            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 50, 0, true, true);
             // Make a new-style probabilistic caller
             auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index);
             packed_caller = poisson_caller;
         } else {

From 21fcfafc1120263e61488e71a6eee1b873c8fbde Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 12 Nov 2019 16:57:31 -0500
Subject: [PATCH 44/79] fix vcf header

---
 src/snarl_caller.cpp | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 6556bbb7bb4..bb55e987755 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -670,10 +670,8 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     }
 
     // get the genotype likelihoods
-    // as above, there's some overlap in these computations as those used in genotype() to begin with
-    // this is an issue with the class interface which probably tries too hard to avoid being VCF-dependent
-    // but if it causes a slowdown (hasn't seemed to be a factor so far), the code could be re-organized
-    // to either store some of this information, or comptue the genotype and vcf fields in a single shot
+    vector<double> gen_likelihoods;
+    double gen_likelihood;
     variant.format.push_back("GL");
 
     // expected depth from our coverage
@@ -687,12 +685,33 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     // assume ploidy 2
     for (int i = 0; i < traversals.size(); ++i) {
         for (int j = i; j < traversals.size(); ++j) {
             double gl = genotype_likelihood({i, j}, traversals, 0, exp_depth, depth_err);
+            gen_likelihoods.push_back(gl);
+            if (vector<int>({i, j}) == genotype || vector<int>({j,i}) == genotype) {
+                gen_likelihood = gl;
+            }
             // convert from natural log to log10 by dividing by ln(10)
             variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258));
         }
     }
 
-    // use old quality for now
+    // get the GQ
+    double prior = log(1. / (double)traversals.size());
+    double p_reads = prior + gen_likelihoods[0];
+    // note that we should be summing over all the likelihoods as considered in genotype()
+    // todo: figure out how to move this to that method.
+    // (or make sure more uncalled stuff makes it into the vcf so we have more traversals to sum over here)
+    /*
+    for (int i = 1; i < gen_likelihoods.size(); ++i) {
+        p_reads = add_log(p_reads, prior + gen_likelihoods[i]);
+    }
+    double posterior = gen_likelihood + prior - p_reads;
+    double gq = logprob_to_phred(logprob_invert(posterior));
+    variant.format.push_back("GQ");
+    variant.samples[sample_name]["GQ"].push_back(std::to_string(gq));
+    */
+
+    // our old min-support based quality as hack until
+    // qual / gq properly sorted out
     variant.quality = min_site_support;
 
     // Now do the filters
@@ -712,6 +731,11 @@ void PoissonSupportSnarlCaller::update_vcf_header(string& header) const {
     header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
     header += "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n";
     header += "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n";
     header += "##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype\">\n";
+    //header += "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality, the Phred-scaled probability estimate of the called genotype\">\n";
+    header += "##FILTER=<ID=lowad,Description=\"Variant does not meet minimum allele read support threshold of " + std::to_string(min_mad_for_filter) + "\">\n";
+    header += "##FILTER=<ID=lowdepth,Description=\"Variant has read depth less than " + std::to_string(min_site_depth) + "\">\n";
 }

From 3e5c17910b147ddb24c48823de7edf7ebe9bf747 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Tue, 12 Nov 2019 14:31:45 -0800
Subject: [PATCH 45/79] Stopped copying seeds

---
 src/seed_clusterer.cpp          | 151 +++++++++++++-------------------
 src/seed_clusterer.hpp          |  34 +++----
 src/unittest/seed_clusterer.cpp |  52 +++++++++--
 3 files changed, 124 insertions(+), 113 deletions(-)

diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp
index dd16f19c8c5..d4597a0b31d 100644
--- a/src/seed_clusterer.cpp
+++ b/src/seed_clusterer.cpp
@@ -2,7 +2,7 @@
 
 #include
 
-#define DEBUG_CLUSTER
+//#define DEBUG_CLUSTER
 namespace vg {
 
     SnarlSeedClusterer::SnarlSeedClusterer( MinimumDistanceIndex& dist_index) :
                                             dist_index(dist_index){
    };
 
-    SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (vector<pos_t> seeds, int64_t read_distance_limit) const {
+    SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (const vector<pos_t>& seeds, int64_t read_distance_limit) const {
+
         vector<vector<pos_t>> all_seeds;
         all_seeds.push_back(seeds);
+
         tuple<vector<SnarlSeedClusterer::cluster_group_t>,SnarlSeedClusterer::cluster_group_t> all_clusters = cluster_seeds(all_seeds, read_distance_limit, 0);
+
         return std::get<0>(all_clusters)[0];
     };
 
     tuple<vector<SnarlSeedClusterer::cluster_group_t>,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds (
-                  vector<vector<pos_t>>& all_seeds, int64_t read_distance_limit,
+                  const vector<vector<pos_t>>& all_seeds, int64_t read_distance_limit,
                   int64_t fragment_distance_limit) const {
         /* Given a vector of seeds and a limit, find a clustering of seeds where
          * seeds that are closer than the limit cluster together.
@@ -34,11 +37,9 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; } //For each level of the snarl tree, maps snarls (index into - //dist_index.snarl_indexes) at that level to - //nodes belonging to the snarl + //dist_index.snarl_indexes) at that level to nodes belonging to the snarl //This is later used to populate snarl_to_node in the tree state - vector>>> - snarl_to_nodes_by_level; + vector>>> snarl_to_nodes_by_level; snarl_to_nodes_by_level.resize(dist_index.tree_depth+1); @@ -55,10 +56,7 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; get_nodes(tree_state, snarl_to_nodes_by_level); //Initialize the tree state to be the bottom level - if (dist_index.tree_depth >= 0) { - tree_state.snarl_to_nodes = - move(snarl_to_nodes_by_level[dist_index.tree_depth]); - } + tree_state.snarl_to_nodes = std::move(snarl_to_nodes_by_level[dist_index.tree_depth]); for (int depth = dist_index.tree_depth ; depth >= 0 ; depth --) { //Go through each level of the tree, bottom up, and cluster that @@ -67,12 +65,10 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; // // tree_state knows all children of the snarls at this level + // Bring in the direct child nodes that come in at this level in the snarl tree. + // They only ever occur below the root. if (depth != 0) { - // Bring in the direct child nodes that come in at this level - // in the snarl tree. - // They only ever occur below the root. - tree_state.parent_snarl_to_nodes = - move(snarl_to_nodes_by_level[depth - 1]); + tree_state.parent_snarl_to_nodes = std::move(snarl_to_nodes_by_level[depth - 1]); } #ifdef DEBUG_CLUSTER @@ -90,7 +86,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cluster_chain_level(tree_state, depth); // Swap buffer over for the next level - tree_state.snarl_to_nodes = move(tree_state.parent_snarl_to_nodes); + tree_state.snarl_to_nodes = std::move(tree_state.parent_snarl_to_nodes); tree_state.chain_to_snarls.clear(); } @@ -143,9 +139,10 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { // Assign each seed to a node. hash_set seen_nodes; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ - vector& seeds = tree_state.all_seeds->at(read_num); + const vector& seeds = tree_state.all_seeds->at(read_num); for (size_t i = 0; i < seeds.size(); i++) { - id_t id = get_id(seeds.at(i)); + pos_t pos = seeds.at(i); + id_t id = get_id(pos); //Assign the seed to a node tree_state.node_to_seeds[read_num].emplace_back(id, i); @@ -211,13 +208,10 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { assert(snarl_index.parent_id >= dist_index.min_node_id); assert(snarl_index.parent_id <= dist_index.max_node_id); #endif - size_t parent_snarl_i = - dist_index.getPrimaryAssignment( - snarl_index.parent_id); + size_t parent_snarl_i = dist_index.getPrimaryAssignment( snarl_index.parent_id); tree_state.parent_snarl_to_nodes[parent_snarl_i].emplace_back( - NetgraphNode (snarl_i, SNARL), - cluster_one_snarl(tree_state, snarl_i)); + NetgraphNode (snarl_i, SNARL), cluster_one_snarl(tree_state, snarl_i)); #ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i @@ -262,10 +256,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { // Find the node ID that heads the parent of that chain. size_t parent_id = dist_index.chain_indexes[chain_i].parent_id; // It must be a legitimate node ID we cover. 
-#ifdef DEBUG_CLUSTER assert(parent_id >= dist_index.min_node_id); assert(parent_id <= dist_index.max_node_id); -#endif + // Map it to the snarl number that should be represented by it // (and thus also contain the chain) size_t parent_snarl_i = dist_index.getPrimaryAssignment(parent_id); @@ -318,10 +311,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //ends of the node pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; - int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) - : get_offset(seed) + 1; - int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 - : node_length - get_offset(seed); + int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) : get_offset(seed) + 1; + int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 : node_length - get_offset(seed); node_clusters.read_best_left[read_num] = min_not_minus_one(dist_left, node_clusters.read_best_left[read_num]); @@ -334,8 +325,11 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { tree_state.read_union_find[read_num].union_groups(group_id, iter->second); if (tree_state.fragment_distance_limit != 0 ) { - if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; - tree_state.fragment_union_find.union_groups(fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); + if (fragment_group_id == -1 ) { + fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; + } + tree_state.fragment_union_find.union_groups( + fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); } } @@ -352,7 +346,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } } #ifdef DEBUG_CLUSTER - cerr << "Found single cluster on node " << node_id << " with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; + cerr << "Found single cluster on node " << node_id << " with fragment dists " + << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; bool got_left = false; bool got_right = false; @@ -587,7 +582,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { combined_left[read_num] = min_not_minus_one(combined_left[read_num], dists.first); combined_right[read_num] = min_not_minus_one(combined_right[read_num], dists.second); } - cerr << "COMBINING READ: " ; if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_group != -1) { tree_state.fragment_union_find.union_groups(fragment_combined_group, @@ -595,7 +589,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } fragment_combined_group = tree_state.fragment_union_find.find_group( cluster_group+tree_state.read_index_offsets[read_num]); - cerr << " AND FRAGMENT" << endl; } return true; } else if (fragment_dist != -1 && @@ -1208,16 +1201,12 @@ cerr << " Maybe combining this cluster from the right" << endl; //Update distances and cluster head of new cluster size_t new_g = tree_state.read_union_find[read_num].find_group(new_group); - if (new_g != new_group) { - snarl_clusters.read_cluster_heads.erase(make_pair(read_num,new_group)); - } - if (new_g != combined_group) { - snarl_clusters.read_cluster_heads.erase(make_pair(read_num,combined_group)); - } + if (new_g != new_group) snarl_clusters.read_cluster_heads.erase(make_pair(read_num,new_group)); + if (new_g != combined_group) snarl_clusters.read_cluster_heads.erase(make_pair(read_num,combined_group)); + 
snarl_clusters.read_cluster_heads.emplace(read_num,new_g); - end_dists = make_pair( - min_not_minus_one(end_dists.first, old_dists.first), - min_not_minus_one(end_dists.second, old_dists.second)); + end_dists = make_pair( min_not_minus_one(end_dists.first, old_dists.first), + min_not_minus_one(end_dists.second, old_dists.second)); tree_state.read_cluster_dists[read_num][new_g] = end_dists; new_group = new_g; combined_group = new_g; @@ -1239,8 +1228,7 @@ cerr << " Maybe combining this cluster from the right" << endl; //Get the children of this snarl and their clusters - vector>& child_nodes = - tree_state.snarl_to_nodes[snarl_index_i]; + vector>& child_nodes = tree_state.snarl_to_nodes[snarl_index_i]; int64_t start_length = snarl_index.nodeLength(0); int64_t end_length = snarl_index.nodeLength(snarl_index.num_nodes*2 -1); @@ -1250,7 +1238,7 @@ cerr << " Maybe combining this cluster from the right" << endl; hash_map, pair> old_dists; for (size_t i = 0; i < child_nodes.size() ; i++) { - //Go through each child node of the netgraph and get clusters + //Go through each child node of the netgraph NetgraphNode& child = child_nodes [i].first; @@ -1259,18 +1247,15 @@ cerr << " Maybe combining this cluster from the right" << endl; id_t child_node_id = child.id_in_parent(dist_index); //Rank of this node in the snarl - //If this node is a snarl/chain, then this snarl will be the - //secondary snarl + //Note, if this node is a snarl/chain, then this snarl will be the secondary snarl size_t node_rank = child.rank_in_parent(dist_index, child_node_id); - size_t rev_rank = node_rank % 2 == 0 - ? node_rank + 1 : node_rank - 1; + size_t rev_rank = node_rank % 2 == 0 ? node_rank + 1 : node_rank - 1; if (child.node_type == NODE) { //If this node is a node, we need to find the clusters int64_t node_len = snarl_index.nodeLength(node_rank); - child_nodes[i].second = cluster_one_node( - tree_state, child_node_id, node_len); + child_nodes[i].second = cluster_one_node(tree_state, child_node_id, node_len); } //Represents all the clusters on this child node @@ -1343,10 +1328,8 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa id_t other_node_id = other_node.id_in_parent(dist_index); //Rank of this node in the snarl - size_t other_rank = other_node.rank_in_parent(dist_index, - other_node_id); - size_t other_rev = other_rank % 2 == 0 - ? other_rank + 1 : other_rank - 1; + size_t other_rank = other_node.rank_in_parent(dist_index, other_node_id); + size_t other_rev = other_rank % 2 == 0 ? 
other_rank + 1 : other_rank - 1; #ifdef DEBUG_CLUSTER cerr << "Other net graph node is " << typeToString(other_node.node_type) @@ -1358,14 +1341,11 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa //Find distance from each end of current node (i) to //each end of other node (j) - int64_t dist_l_l = snarl_index.snarlDistance( - rev_rank, other_rank); - int64_t dist_l_r = snarl_index.snarlDistance( - rev_rank, other_rev); - int64_t dist_r_l = snarl_index.snarlDistance( - node_rank, other_rank); - int64_t dist_r_r = snarl_index.snarlDistance( - node_rank, other_rev); + int64_t dist_l_l = snarl_index.snarlDistance(rev_rank, other_rank); + int64_t dist_l_r = snarl_index.snarlDistance(rev_rank, other_rev); + int64_t dist_r_l = snarl_index.snarlDistance(node_rank, other_rank); + int64_t dist_r_r = snarl_index.snarlDistance(node_rank, other_rev); + #ifdef DEBUG_CLUSTER cerr << "\t distances between ranks " << node_rank << " and " << other_rank << ": " << dist_l_l << " " << dist_l_r << " " << dist_r_l << " " @@ -1385,13 +1365,11 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank if (max({dist_l_l, dist_l_r, dist_r_l, dist_r_r}) != -1 && ((tree_state.fragment_distance_limit == 0 && - MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, - dist_r_l, dist_r_r})-2 <= tree_state.read_distance_limit + MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, dist_r_l, dist_r_r})-2 <= tree_state.read_distance_limit && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.read_distance_limit) || (tree_state.fragment_distance_limit != 0 && - MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, - dist_r_l, dist_r_r})-2 <= tree_state.fragment_distance_limit + MinimumDistanceIndex::minPos({dist_l_l, dist_l_r,dist_r_l, dist_r_r})-2 <= tree_state.fragment_distance_limit && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.fragment_distance_limit) )) { @@ -1405,8 +1383,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank pair dists_c = old_dists[child_cluster_head]; - if (dist_l_l != -1 && dists_c.first != -1 - && other_node_clusters.fragment_best_left != -1 ) { + if (dist_l_l != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_left != -1 ) { //If cluster child_cluster_head can be combined with clusters in j //from the left of both of them int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : @@ -1416,8 +1393,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank fragment_dist, read_dist, read_num); } - if (dist_l_r != -1 && dists_c.first != -1 - && other_node_clusters.fragment_best_right != -1 ) { + if (dist_l_r != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j int64_t fragment_dist = dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? 
-1 : @@ -1425,16 +1401,14 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, fragment_dist, read_dist, read_num); } - if (dist_r_l != -1 && dists_c.second != -1 - && other_node_clusters.fragment_best_left != -1 ) { + if (dist_r_l != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_left != -1 ) { int64_t fragment_dist = dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1; int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, fragment_dist, read_dist, read_num); } - if (dist_r_r != -1 && dists_c.second != -1 - && other_node_clusters.fragment_best_right != -1 ) { + if (dist_r_r != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_right != -1 ) { int64_t fragment_dist = dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? -1 : dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1; @@ -1443,17 +1417,16 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank } } - //Go through children of j + + //Go through clusters of child node j vector> children_j( make_move_iterator(other_node_clusters.read_cluster_heads.begin()), make_move_iterator(other_node_clusters.read_cluster_heads.end())); for (size_t k_i = 0 ; k_i < children_j.size() ; k_i++){ - //For each cluster of child j, find which overlaps with - //clusters of i - //child_cluster_head will already be part of a cluster in - //snarlcluster heads but since we need to know the node - //that the snarl is on we can't just loop through + //For each cluster of child j, find which overlaps with clusters of i + //child_cluster_head will already be part of a cluster in snarl_cluster_heads but + //since we need to know the node that the snarl is on we can't just loop through //snarl_cluster heads pair child_cluster_head = children_j[k_i]; size_t read_num = child_cluster_head.first; @@ -1461,8 +1434,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank size_t k_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 - && dists_k.first != -1 ){ + if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.first != -1 ){ int64_t fragment_dist = dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : @@ -1470,8 +1442,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, fragment_dist,read_dist, read_num); } - if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 - && dists_k.second != -1 ) { + if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.second != -1 ) { int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? 
-1 : dist_l_r + curr_child_clusters.read_best_left[read_num] + dists_k.second-1; @@ -1479,8 +1450,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, fragment_dist, read_dist, read_num); } - if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 - && dists_k.first != -1 ) { + if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.first != -1 ) { int64_t fragment_dist = dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : @@ -1488,8 +1458,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, fragment_dist, read_dist, read_num); } - if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 - && dists_k.second != -1 ) { + if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.second != -1 ) { int64_t fragment_dist = dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index e35f877d4a7..fd07b696368 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -14,7 +14,7 @@ class SnarlSeedClusterer { SnarlSeedClusterer(MinimumDistanceIndex& dist_index); - //Represents all clusters for one vector of seeds + //Represents all clusters for one vector of seeds (corresponding to a read) //Each cluster is a vector of indexes into the vector of seeds typedef vector> cluster_group_t; @@ -22,7 +22,7 @@ class SnarlSeedClusterer { //cluster the seeds such that two seeds whose minimum distance //between them (including both of the positions) is less than // the distance limit are in the same cluster - cluster_group_t cluster_seeds ( vector seeds, int64_t read_distance_limit) const; + cluster_group_t cluster_seeds ( const vector& seeds, int64_t read_distance_limit) const; ///The same thing, but for paired end reads. //Given seeds from multiple reads of a fragment, cluster each read @@ -33,7 +33,7 @@ class SnarlSeedClusterer { //The fragment clusters give seeds the index they would get if the vectors of // seeds were appended to each other in the order given tuple, cluster_group_t> cluster_seeds ( - vector>& all_seeds, + const vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit=0) const; private: @@ -57,7 +57,9 @@ class SnarlSeedClusterer { } struct NetgraphNode { - //child nodes of a snarl's netgraph + //Represents a child node of a snarl's netgraph + + //node_id is the node id if the node is just a node, index into //dist_index's snarl_indexes/chain_index if it is a snarl/chain size_t node_id; @@ -87,9 +89,8 @@ class SnarlSeedClusterer { //Get the forward rank of this node in the parent's netgraph //to look up distances - size_t rank = node_type == NODE ? - dist_index.getPrimaryRank(id) : - dist_index.getSecondaryRank(id); + size_t rank = node_type == NODE ? 
dist_index.getPrimaryRank(id) : + dist_index.getSecondaryRank(id); if ( (node_type == SNARL && dist_index.snarl_indexes[dist_index.getPrimaryAssignment(id)].rev_in_parent) || (node_type == CHAIN && @@ -101,7 +102,7 @@ class SnarlSeedClusterer { }; struct NodeClusters { - //Clusters in the context of a snarl tree node + //All clusters of a snarl tree node //The node containing this struct may be an actual node, // snarl/chain that is a node the parent snarl's netgraph, // or a snarl in a chain @@ -134,7 +135,7 @@ class SnarlSeedClusterer { //is updated to know about its children //Vector of all the seeds for each read - vector>* all_seeds; + const vector>* all_seeds; //prefix sum vector of the number of seeds per read //To get the index of a seed for the fragment clusters @@ -171,8 +172,7 @@ class SnarlSeedClusterer { //Map from snarl (index into dist_index.snarl_indexes) i //to the netgraph nodes contained in the snarl as well as the //clusters at the node - hash_map>> - snarl_to_nodes; + hash_map>> snarl_to_nodes; //Map each chain to the snarls (only ones that contain seeds) that //comprise it. @@ -181,19 +181,17 @@ class SnarlSeedClusterer { //Map maps the rank of the snarl to the snarl and snarl's clusters // Since maps are ordered, it will be in the order of traversal // of the snarls in the chain - hash_map>> - chain_to_snarls; + hash_map>> chain_to_snarls; //Same structure as snarl_to_nodes but for the level of the snarl //tree above the current one //This gets updated as the current level is processed - hash_map>> - parent_snarl_to_nodes; + hash_map>> parent_snarl_to_nodes; //Constructor takes in a pointer to the seeds, the distance limits, and //the total number of seeds in all_seeds - TreeState (vector>* all_seeds, int64_t read_distance_limit, + TreeState (const vector>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), read_distance_limit(read_distance_limit), @@ -201,11 +199,13 @@ class SnarlSeedClusterer { fragment_union_find (seed_count, false), read_index_offsets(1,0){ - for (vector& v : *all_seeds) { + for (size_t i = 0 ; i < all_seeds->size() ; i++) { + const vector& v = all_seeds->at(i); size_t offset = read_index_offsets.back() + v.size(); read_index_offsets.push_back(offset); read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1)); node_to_seeds.emplace_back(); + node_to_seeds.back().reserve(v.size()); read_union_find.emplace_back(v.size(), false); } } diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index e6fec99233c..8c4332d9d10 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -823,6 +823,49 @@ namespace unittest { } }//end test case + + /* + TEST_CASE("Load graph", "[cluster]"){ + + ifstream vg_stream("testGraph"); + VG vg(vg_stream); + vg_stream.close(); + CactusSnarlFinder bubble_finder(vg); + SnarlManager snarl_manager = bubble_finder.find_snarls(); + + MinimumDistanceIndex dist_index (&vg, &snarl_manager); + SnarlSeedClusterer clusterer(dist_index); + + int64_t read_lim = 20;// Distance between read clusters + int64_t fragment_lim = 30;// Distance between fragment clusters + + vector> all_seeds; + all_seeds.emplace_back(); + all_seeds.emplace_back(); + + + all_seeds[0].push_back(make_pos_t(206, true, 9)); + all_seeds[0].push_back(make_pos_t(277, false, 1)); + all_seeds[0].push_back(make_pos_t(263, true, 11)); + all_seeds[0].push_back(make_pos_t(280, false, 10)); + all_seeds[0].push_back(make_pos_t(279, true, 3)); + 
all_seeds[0].push_back(make_pos_t(282, false, 0)); + all_seeds[0].push_back(make_pos_t(300, false, 0)); + all_seeds[0].push_back(make_pos_t(248, false, 0)); + all_seeds[0].push_back(make_pos_t(245, false, 0)); + all_seeds[0].push_back(make_pos_t(248, true, 0)); + + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, read_lim, fragment_lim); + vector>> read_clusters = std::get<0>(paired_clusters); + vector> fragment_clusters = std::get<1>(paired_clusters); + cerr << "read cluster: " << read_clusters[0].size() << endl << "fragment clusters: " << fragment_clusters.size() << endl; + + REQUIRE(fragment_clusters.size() == 2); + REQUIRE((fragment_clusters[0].size() == 4 || + fragment_clusters[1].size() == 4)); + }//end test case + */ TEST_CASE("Random graphs", "[cluster]"){ for (int i = 0; i < 1000; i++) { @@ -846,13 +889,14 @@ namespace unittest { uniform_int_distribution randSnarlIndex(0, allSnarls.size()-1); default_random_engine generator(time(NULL)); for (size_t k = 0; k < 1000 ; k++) { + vector> all_seeds; all_seeds.emplace_back(); all_seeds.emplace_back(); - int64_t read_lim = 20;// Distance between read clusters + int64_t read_lim = 15;// Distance between read clusters int64_t fragment_lim = 30;// Distance between fragment clusters for (size_t read = 0 ; read < 2 ; read ++) { - for (int j = 0; j < 20; j++) { + for (int j = 0; j < 200; j++) { //Check clusters of j random positions const Snarl* snarl1 = allSnarls[randSnarlIndex(generator)]; @@ -899,9 +943,7 @@ namespace unittest { for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { pos_t pos1 = all_seeds[read_num][clust[i1]]; size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); - pos_t rev1 = make_pos_t(get_id(pos1), - !is_rev(pos1), - len1 - get_offset(pos1)-1); + pos_t rev1 = make_pos_t(get_id(pos1), !is_rev(pos1),len1 - get_offset(pos1)-1); for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { if (b != a) { From fc2cf1664beae36058a46e512f1c51551f72dd4e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 21:44:28 -0500 Subject: [PATCH 46/79] activate pruning in poisson caller. 
also use iterative pruning if first try doesn't meet cutoff --- src/snarl_caller.cpp | 20 ++++++++++---------- src/snarl_caller.hpp | 17 ++++++++--------- src/traversal_finder.cpp | 6 +++--- src/traversal_finder.hpp | 10 ++++++++-- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index bb55e987755..0d307fc19a2 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -8,9 +8,9 @@ namespace vg { SnarlCaller::~SnarlCaller() { } -function<bool(const SnarlTraversal&)> SnarlCaller::get_skip_allele_fn() const { +function<bool(const SnarlTraversal&, int)> SnarlCaller::get_skip_allele_fn() const { // default implementation says don't skip anything - return [](const SnarlTraversal&) { return false; }; + return [](const SnarlTraversal&, int) { assert(false); return false; }; } SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, @@ -65,6 +65,14 @@ int SupportBasedSnarlCaller::get_best_support(const vector<Support>& supports, c return best_allele; } +function<bool(const SnarlTraversal&, int)> SupportBasedSnarlCaller::get_skip_allele_fn() const { + // port over cutoff used in old support caller (there avg support used all the time, here + // we use the same toggles as when genotyping) + return [&](const SnarlTraversal& trav, int iteration) -> bool { + return support_val(support_finder.get_traversal_support(trav)) < pow(2, iteration) * min_alt_path_support; + }; +} + RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder) : SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { @@ -404,14 +412,6 @@ void RatioSupportSnarlCaller::update_vcf_header(string& header) const { std::to_string(min_site_depth) + "\">\n"; } -function<bool(const SnarlTraversal&)> RatioSupportSnarlCaller::get_skip_allele_fn() const { - // port over cutoff used in old support caller (there avg support used all the time, here - // we use the same toggles as when genotyping) - return [&](const SnarlTraversal& trav) -> bool { - return support_val(support_finder.get_traversal_support(trav)) < min_alt_path_support; - }; -} - double RatioSupportSnarlCaller::get_bias(const vector<int>& traversal_sizes, int best_trav, int second_best_trav, int ref_trav_idx) const { bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 62f18563f56..d88c2bcd4d5 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -51,7 +51,7 @@ class SnarlCaller { virtual void update_vcf_header(string& header) const = 0; /// Optional method used for pruning searches - virtual function<bool(const SnarlTraversal&)> get_skip_allele_fn() const; + virtual function<bool(const SnarlTraversal&, int)> get_skip_allele_fn() const; }; /** @@ -82,6 +82,9 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Get the minimum total support for call virtual int get_min_total_support_for_call() const; + /// Use min_alt_path_support threshold as cutoff + virtual function<bool(const SnarlTraversal&, int)> get_skip_allele_fn() const; + protected: /// Get the best support out of a list of supports, ignoring skips @@ -107,6 +110,9 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// what's the minimum total support (over all alleles) of the site to make /// a call size_t min_site_depth = 3; + /// used only for pruning alleles in the VCFTraversalFinder: minimum support + /// of an allele's alt-path for it to be considered in the brute-force enumeration + double min_alt_path_support = 0.2; }; @@ -141,9 +147,6 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { /// Define
any header fields needed by the above virtual void update_vcf_header(string& header) const; - /// Use min_alt_path_support threshold as cutoff - virtual function get_skip_allele_fn() const; - protected: /// Get the bias used to for comparing two traversals @@ -171,11 +174,7 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { /// the reference, the call is made. set to 0 to deactivate. double max_ma_bias = 0; /// what's the min log likelihood for allele depth assignments to PASS? - double min_ad_log_likelihood_for_filter = -9; - /// used only for pruning alleles in the VCFTraversalFinder: minimum support - /// of an allele's alt-path for it to be considered in the brute-force enumeration - double min_alt_path_support = 0.2; - + double min_ad_log_likelihood_for_filter = -9; }; /** diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index 540bcdf54f7..74f40700101 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -2546,7 +2546,7 @@ VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManage const vector& ref_path_names, FastaReference* ref_fasta, FastaReference* ins_fasta, - function skip_alt, + function skip_alt, size_t max_traversal_cutoff) : graph(graph), snarl_manager(snarl_manager), @@ -3248,7 +3248,7 @@ vector> VCFTraversalFinder::get_pruned_alt_alleles( } // only invoke pruning if we exceed our cutoff. fairly rare on most graphs - if (!check_max_trav_cutoff(alt_alleles)) { + for (int prune_it = 0; prune_it < max_prune_iterations && !check_max_trav_cutoff(alt_alleles); ++prune_it) { for (auto& alleles : alt_alleles) { alleles.clear(); } @@ -3256,7 +3256,7 @@ vector> VCFTraversalFinder::get_pruned_alt_alleles( for (int var_i = 0; var_i < site_variants.size(); ++var_i) { for (int allele = 0; allele < site_variants[var_i]->alleles.size(); ++allele) { if (skip_alt == nullptr || - skip_alt(get_alt_path(site_variants[var_i], allele, ref_path).first) == false) { + skip_alt(get_alt_path(site_variants[var_i], allele, ref_path).first, prune_it) == false) { alt_alleles[var_i].push_back(allele); } #ifdef debug diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 4257ce58e13..573c17155c9 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -416,13 +416,19 @@ class VCFTraversalFinder : public TraversalFinder { /// Use this method to prune the search space by selecting alt-alleles /// to skip by considering their paths (in SnarlTraversal) format - function skip_alt; + /// It will try again and again until enough traversals are pruned, + /// with iteration keeping track of how many tries (so it should become stricter + /// as iteration increases) + function skip_alt; /// If a snarl has more than this many traversals, return nothing and print /// a warning. Dense and large deletions will make this happen from time /// to time. In practice, skip_alt (above) can be used to prune down /// the search space by selecting alleles to ignore. 
size_t max_traversal_cutoff; + + /// Maximum number of pruning iterations + size_t max_prune_iterations = 1000; /// Include snarl endpoints in traversals bool include_endpoints = true; @@ -446,7 +452,7 @@ class VCFTraversalFinder : public TraversalFinder { const vector& ref_path_names = {}, FastaReference* fasta_ref = nullptr, FastaReference* ins_ref = nullptr, - function skip_alt = nullptr, + function skip_alt = nullptr, size_t max_traversal_cutoff = 500000); virtual ~VCFTraversalFinder(); From af07aad51963597186f6520993b3edc820022154 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Wed, 13 Nov 2019 11:01:33 -0800 Subject: [PATCH 47/79] Fixed comments, weird indentations, etc --- scripts/giraffe-wrangler.sh | 2 +- src/seed_clusterer.cpp | 126 +++++++++++++++----------------- src/seed_clusterer.hpp | 19 +++-- src/subcommand/gaffe_main.cpp | 4 +- src/unittest/seed_clusterer.cpp | 2 +- 5 files changed, 76 insertions(+), 77 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index 5dfaedb3083..20f752148d9 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -91,7 +91,7 @@ echo "${SIM_GAM}" echo "${REAL_FASTQ}" # Define the Giraffe parameters -GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 150 -a 4 -s 50 -u 0.4 -v 1 -w 20) +GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 300 -a 6 -s 50 -u 0.4 -v 1 -w 20) # Define a work directory # TODO: this requires GNU mptemp diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index d4597a0b31d..16a6a9ae7a2 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -11,9 +11,10 @@ namespace vg { }; SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (const vector& seeds, int64_t read_distance_limit) const { + //Wrapper for single ended - vector> all_seeds; - all_seeds.push_back(seeds); + vector*> all_seeds; + all_seeds.push_back(&seeds); tuple,SnarlSeedClusterer::cluster_group_t> all_clusters = cluster_seeds(all_seeds, read_distance_limit, 0); @@ -24,6 +25,16 @@ namespace vg { tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( const vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit) const { + //Wrapper for paired end + vector*> seed_pointers; + seed_pointers.reserve(all_seeds.size()); + for (const vector& v : all_seeds) seed_pointers.push_back(&v); + return cluster_seeds(seed_pointers, read_distance_limit, fragment_distance_limit); + } + + tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( + const vector*>& all_seeds, int64_t read_distance_limit, + int64_t fragment_distance_limit) const { /* Given a vector of seeds and a limit, find a clustering of seeds where * seeds that are closer than the limit cluster together. 
* Returns a vector of cluster assignments @@ -47,7 +58,7 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; //for a single level of the snarl tree as it is being processed //It also keeps track of the parents of the current level size_t seed_count = 0; - for (auto& v : all_seeds) seed_count+= v.size(); + for (auto v : all_seeds) seed_count+= v->size(); TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); @@ -74,7 +85,7 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; #ifdef DEBUG_CLUSTER assert(tree_state.read_index_offsets[0] == 0); for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { - assert (tree_state.read_index_offsets[i] + tree_state.all_seeds->at(i).size() == tree_state.read_index_offsets[i+1]); + assert (tree_state.read_index_offsets[i] + tree_state.all_seeds->at(i)->size() == tree_state.read_index_offsets[i+1]); } #endif //Cluster all the snarls at this depth @@ -97,7 +108,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { for (auto group : tree_state.read_union_find[read_num].all_groups()){ cerr << "\t\t"; for (size_t c : group) { - cerr << tree_state.all_seeds->at(read_num)[c] << " "; + cerr << tree_state.all_seeds->at(read_num)->at(c) << " "; } cerr << endl; } @@ -139,9 +150,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { // Assign each seed to a node. hash_set seen_nodes; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ - const vector& seeds = tree_state.all_seeds->at(read_num); - for (size_t i = 0; i < seeds.size(); i++) { - pos_t pos = seeds.at(i); + const vector* seeds = tree_state.all_seeds->at(read_num); + for (size_t i = 0; i < seeds->size(); i++) { + pos_t pos = seeds->at(i); id_t id = get_id(pos); //Assign the seed to a node @@ -310,7 +321,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //And find the shortest distance from any seed to both //ends of the node - pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + pos_t seed = tree_state.all_seeds->at(read_num)->at(iter->second); int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) : get_offset(seed) + 1; int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 : node_length - get_offset(seed); @@ -360,9 +371,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -402,7 +413,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { //For each seed, find its offset - pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + pos_t seed = tree_state.all_seeds->at(read_num)->at(iter->second); int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) : get_offset(seed) + 1; node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); @@ -498,9 +509,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -620,8 +631,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //Union the two groups tree_state.read_union_find[read_num].union_groups(combined_group, new_group); //Find the new distances of the combined groups - pair& old_dists = - tree_state.read_cluster_dists[read_num][combined_group]; + pair& old_dists = tree_state.read_cluster_dists[read_num][combined_group]; size_t new_combined_group = tree_state.read_union_find[read_num].find_group(new_group); //Update which groups are being kept track of if (new_combined_group != new_group) { @@ -657,8 +667,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //If these aren't in the same read cluster but are in //the same fragment cluster if (fragment_combined_group != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_group, - new_group + tree_state.read_index_offsets[read_num]); + tree_state.fragment_union_find.union_groups( + fragment_combined_group, new_group + tree_state.read_index_offsets[read_num]); } fragment_combined_group = tree_state.fragment_union_find.find_group( new_group + tree_state.read_index_offsets[read_num]); @@ -694,8 +704,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //The clusters of the current snarl NodeClusters& snarl_clusters = kv.second.second; - MinimumDistanceIndex::SnarlIndex& snarl_index = - dist_index.snarl_indexes[curr_snarl_i]; + MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[curr_snarl_i]; //Get the lengths of the start and end nodes of the snarl, relative //to the order of the chain @@ -738,10 +747,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //Distance from the start of chain to the start of the current snarl - int64_t add_dist_left = start_rank == 0 ? 0 : - chain_index.prefix_sum[start_rank] - 1; - - + int64_t add_dist_left = start_rank == 0 ? 
0 : chain_index.prefix_sum[start_rank] - 1; //Combine snarl clusters that can be reached by looping int64_t loop_dist_end = chain_index.loop_fd[start_rank + 1] - 1 ; @@ -759,9 +765,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { << endl; cerr << "\t\t"; bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -779,9 +785,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cerr << "\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; } } cerr << endl; @@ -833,10 +839,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { ? -1 : snarl_dists.first + loop_dist_start + snarl_length - start_length; snarl_dists.second = min_not_minus_one(new_right, snarl_dists.second); - snarl_clusters.fragment_best_right = - min_not_minus_one(snarl_clusters.fragment_best_right, new_right); - snarl_clusters.read_best_right[read_num] = - min_not_minus_one(snarl_clusters.read_best_right[read_num], new_right); + snarl_clusters.fragment_best_right = min_not_minus_one(snarl_clusters.fragment_best_right, new_right); + snarl_clusters.read_best_right[read_num] = min_not_minus_one(snarl_clusters.read_best_right[read_num], new_right); #ifdef DEBUG_CLUSTER cerr << " (Possibly) updating looping distance to right of snarl cluster " << read_num <<":" << cluster_head.second << ": " << new_right << " -> " << snarl_dists.second << endl; @@ -868,8 +872,7 @@ cerr << " Combining this cluster from the left " ; if (snarl_dists.first == -1 || (new_left != -1 && new_left < snarl_dists.first)){ //If this is an improvement, update distances snarl_dists.first = new_left; - snarl_clusters.read_best_left[read_num] = - min_not_minus_one(new_left, snarl_clusters.read_best_left[read_num]); + snarl_clusters.read_best_left[read_num] = min_not_minus_one(new_left, snarl_clusters.read_best_left[read_num]); snarl_clusters.fragment_best_left = min_not_minus_one(new_left, snarl_clusters.fragment_best_left); #ifdef DEBUG_CLUSTER @@ -891,8 +894,7 @@ cerr << " Maybe combining this cluster from the right" << endl; snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1; combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], - fragment_snarl_cluster_right, to_erase,fragment_dist, - read_dist, snarl_dists, read_num); + fragment_snarl_cluster_right, to_erase,fragment_dist, read_dist, snarl_dists, read_num); } } @@ -900,10 +902,13 @@ cerr << " Maybe combining this cluster from the right" << endl; //existing chain clusters int64_t read_dist = read_chain_right[read_num] == -1 || snarl_dists.first == -1 ? -1 : snarl_dists.first + read_chain_right[read_num] - start_length-1; + int64_t fragment_dist = tree_state.fragment_distance_limit == 0 || fragment_chain_right == -1 || snarl_dists.first == -1 ? 
-1 : snarl_dists.first+fragment_chain_right-start_length-1; + pair new_snarl_dists (snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left, snarl_dists.second); + bool combined_read = combine_chain_clusters (cluster_head.second,combined_cluster, fragment_combined_cluster, combined_left, combined_right, new_snarl_dists, to_erase, fragment_dist, read_dist, read_num); @@ -915,10 +920,8 @@ cerr << " Maybe combining this cluster from the right" << endl; snarl_dists.second); chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left,d.first); chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right,d.second); - chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], - d.first); - chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], - d.second); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], d.first); + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], d.second); tree_state.read_cluster_dists[read_num][cluster_head.second] = std::move(d); } @@ -997,9 +1000,9 @@ cerr << " Maybe combining this cluster from the right" << endl; cerr << "\t\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t\t"; bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -1046,11 +1049,9 @@ cerr << " Maybe combining this cluster from the right" << endl; pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; if ((chain_dists.second != -1 && chain_clusters.read_best_left[read_num] != -1 && - chain_dists.second + chain_clusters.read_best_left[read_num] - first_length - 1 - <= tree_state.read_distance_limit) || + chain_dists.second + chain_clusters.read_best_left[read_num] - first_length - 1 <= tree_state.read_distance_limit) || (chain_dists.first != -1 && chain_clusters.read_best_right[read_num] != -1 && - chain_dists.first + chain_clusters.read_best_right[read_num] - first_length - 1 - <= tree_state.read_distance_limit)){ + chain_dists.first + chain_clusters.read_best_right[read_num] - first_length - 1 <= tree_state.read_distance_limit)){ //If this chain cluster is in the combined cluster if (combined_cluster[read_num] == -1) { combined_cluster[read_num] = cluster_head.second; @@ -1076,11 +1077,9 @@ cerr << " Maybe combining this cluster from the right" << endl; } } else if (tree_state.fragment_distance_limit != 0 && ((chain_dists.second != -1 && chain_clusters.fragment_best_left != -1 && - chain_dists.second + chain_clusters.fragment_best_left - first_length - 1 - <= tree_state.fragment_distance_limit) || + chain_dists.second + chain_clusters.fragment_best_left - first_length - 1 <= tree_state.fragment_distance_limit) || (chain_dists.first != -1 && chain_clusters.fragment_best_right != -1 && - chain_dists.first + chain_clusters.fragment_best_right - first_length - 1 - <= tree_state.fragment_distance_limit))){ + chain_dists.first + chain_clusters.fragment_best_right - first_length - 1 <= tree_state.fragment_distance_limit))){ //If we can cluster by fragment if 
(fragment_combined_cluster != -1) { tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); @@ -1116,9 +1115,9 @@ cerr << " Maybe combining this cluster from the right" << endl; pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -1159,8 +1158,7 @@ cerr << " Maybe combining this cluster from the right" << endl; TreeState& tree_state, size_t snarl_index_i) const { /*Get the clusters on this snarl. * Nodes have not yet been clustered */ - MinimumDistanceIndex::SnarlIndex& snarl_index = - dist_index.snarl_indexes[snarl_index_i]; + MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[snarl_index_i]; #ifdef DEBUG_CLUSTER cerr << "Finding clusters on snarl number " << snarl_index_i << " headed by node " << snarl_index.id_in_parent << endl; @@ -1169,8 +1167,7 @@ cerr << " Maybe combining this cluster from the right" << endl; //Keep track of all clusters on this snarl NodeClusters snarl_clusters(tree_state.all_seeds->size()); - auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, - size_t& fragment_combined_group, + auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, size_t& fragment_combined_group, int64_t fragment_dist, int64_t read_dist, size_t read_num){ //Helper function to compare and combine clusters in two nodes of the same snarl //If the distance between two clusters is small enough, then combine them @@ -1273,9 +1270,9 @@ cerr << " Maybe combining this cluster from the right" << endl; cerr << "\tdist left: " << tree_state.read_cluster_dists[c.first][c.second].first << " dist right: " << tree_state.read_cluster_dists[c.first][c.second].second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; } } cerr << endl; @@ -1304,10 +1301,8 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa << " : " << new_dists.first << " " << new_dists.second << endl; #endif - snarl_clusters.fragment_best_left =min_not_minus_one( - snarl_clusters.fragment_best_left,new_dists.first); - snarl_clusters.fragment_best_right = min_not_minus_one( - snarl_clusters.fragment_best_right, new_dists.second); + snarl_clusters.fragment_best_left =min_not_minus_one( snarl_clusters.fragment_best_left,new_dists.first); + snarl_clusters.fragment_best_right = min_not_minus_one(snarl_clusters.fragment_best_right, new_dists.second); snarl_clusters.read_best_left[child_cluster_head.first] =min_not_minus_one( snarl_clusters.read_best_left[child_cluster_head.first], new_dists.first); snarl_clusters.read_best_right[child_cluster_head.first] = min_not_minus_one( @@ -1463,8 +1458,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank int64_t fragment_dist = dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 
? -1 : dist_r_r + curr_child_clusters.read_best_right[read_num] + dists_k.second-1; - combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, - fragment_dist, read_dist, read_num); + combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, fragment_dist, read_dist, read_num); } } } @@ -1488,9 +1482,9 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index fd07b696368..6971080a965 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -38,6 +38,11 @@ class SnarlSeedClusterer { private: + //Actual clustering function that takes a vector of pointers to seeds + tuple, cluster_group_t> cluster_seeds ( + const vector*>& all_seeds, + int64_t read_distance_limit, int64_t fragment_distance_limit=0) const; + MinimumDistanceIndex& dist_index; enum ChildNodeType {CHAIN, SNARL, NODE}; @@ -135,7 +140,7 @@ class SnarlSeedClusterer { //is updated to know about its children //Vector of all the seeds for each read - const vector>* all_seeds; + const vector*>* all_seeds; //prefix sum vector of the number of seeds per read //To get the index of a seed for the fragment clusters @@ -191,7 +196,7 @@ class SnarlSeedClusterer { //Constructor takes in a pointer to the seeds, the distance limits, and //the total number of seeds in all_seeds - TreeState (const vector>* all_seeds, int64_t read_distance_limit, + TreeState (const vector*>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), read_distance_limit(read_distance_limit), @@ -200,13 +205,13 @@ class SnarlSeedClusterer { read_index_offsets(1,0){ for (size_t i = 0 ; i < all_seeds->size() ; i++) { - const vector& v = all_seeds->at(i); - size_t offset = read_index_offsets.back() + v.size(); + size_t size = all_seeds->at(i)->size(); + size_t offset = read_index_offsets.back() + size; read_index_offsets.push_back(offset); - read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1)); + read_cluster_dists.emplace_back(size, make_pair(-1,-1)); node_to_seeds.emplace_back(); - node_to_seeds.back().reserve(v.size()); - read_union_find.emplace_back(v.size(), false); + node_to_seeds.back().reserve(size); + read_union_find.emplace_back(size, false); } } }; diff --git a/src/subcommand/gaffe_main.cpp b/src/subcommand/gaffe_main.cpp index fd882eaedba..3de30c6b183 100644 --- a/src/subcommand/gaffe_main.cpp +++ b/src/subcommand/gaffe_main.cpp @@ -358,9 +358,9 @@ int main_gaffe(int argc, char** argv) { // How many mappings per read can we emit? Range max_multimaps = 1; // How many clusters should we extend? - Range max_extensions = 150; + Range max_extensions = 300; // How many extended clusters should we align, max? 
- Range max_alignments = 4; + Range max_alignments = 6; //Throw away cluster with scores that are this amount below the best Range cluster_score = 50; //Throw away clusters with coverage this amount below the best diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 8c4332d9d10..38e031dd75e 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -868,7 +868,7 @@ namespace unittest { */ TEST_CASE("Random graphs", "[cluster]"){ - for (int i = 0; i < 1000; i++) { + for (int i = 0; i < 0; i++) { // For each random graph VG graph; random_graph(1000, 20, 100, &graph); From efb8eed3a8fca8c1078ea750fd206dd3d66f8a55 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 13 Nov 2019 15:53:46 -0500 Subject: [PATCH 48/79] ratchet down vcf allele search space --- src/snarl_caller.cpp | 6 ++++++ src/snarl_caller.hpp | 2 +- src/traversal_finder.hpp | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 0d307fc19a2..c7567a90439 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -496,6 +496,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl, if (skips.count(best_allele)) { continue; } + if (support_val(supports[best_allele]) < min_total_support_for_call) { + break; + } if (ploidy == 1) { candidates.insert({best_allele}); @@ -524,6 +527,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl, size_t sec_count = 0; for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { int second_best_allele = ranked_secondary_traversals[j]; + if (support_val(secondary_supports[second_best_allele]) < min_total_support_for_call) { + break; + } if (!skips.count(second_best_allele) && second_best_allele != best_allele) { // canonical ordering for our set candidates.insert({min(best_allele, second_best_allele), max(best_allele, second_best_allele)}); diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index d88c2bcd4d5..1be11053721 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -112,7 +112,7 @@ class SupportBasedSnarlCaller : public SnarlCaller { size_t min_site_depth = 3; /// used only for pruning alleles in the VCFTraversalFinder: minimum support /// of an allele's alt-path for it to be considered in the brute-force enumeration - double min_alt_path_support = 0.2; + double min_alt_path_support = 0.5; }; diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 573c17155c9..f85e602c775 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -428,7 +428,7 @@ class VCFTraversalFinder : public TraversalFinder { size_t max_traversal_cutoff; /// Maximum number of pruning iterations - size_t max_prune_iterations = 1000; + size_t max_prune_iterations = 2; /// Include snarl endpoints in traversals bool include_endpoints = true; @@ -453,7 +453,7 @@ class VCFTraversalFinder : public TraversalFinder { FastaReference* fasta_ref = nullptr, FastaReference* ins_ref = nullptr, function<bool(const SnarlTraversal&, int)> skip_alt = nullptr, - size_t max_traversal_cutoff = 500000); + size_t max_traversal_cutoff = 50000); virtual ~VCFTraversalFinder(); From 2bf760d4e1174aed1e9e140bf09e68db5c8318d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 14 Nov 2019 09:14:35 -0500 Subject: [PATCH 49/79] better vcf output buffering --- src/graph_caller.cpp | 15 +++++++++------ src/graph_caller.hpp | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 22622e475e0..141ac63bede
100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -45,6 +45,7 @@ void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { } VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name) { + output_variants.resize(get_thread_count()); } VCFOutputCaller::~VCFOutputCaller() { @@ -74,17 +75,19 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector + vector<vcflib::Variant> all_variants; + for (const auto& buf : output_variants) { + all_variants.reserve(all_variants.size() + buf.size()); + std::move(buf.begin(), buf.end(), std::back_inserter(all_variants)); + } + std::sort(all_variants.begin(), all_variants.end(), [](const vcflib::Variant& v1, const vcflib::Variant& v2) { return v1.sequenceName < v2.sequenceName || (v1.sequenceName == v2.sequenceName && v1.position < v2.position); }); - for (auto v : output_variants) { + for (auto v : all_variants) { v.setVariantCallFile(output_vcf); out_stream << v << endl; } diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 2f4d8c38bbb..5845be1f666 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -74,8 +74,8 @@ class VCFOutputCaller { /// Sample name string sample_name; - /// output buffer (for sorting) - mutable vector<vcflib::Variant> output_variants; + /// output buffers (1/thread) (for sorting) + mutable vector<vector<vcflib::Variant>> output_variants; }; /** From 2bf760d4e1174aed1e9e140bf09e68db5c8318d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 14 Nov 2019 09:14:35 -0500 Subject: [PATCH 50/79] use caching in traversal support finder --- src/subcommand/call_main.cpp | 4 +-- src/traversal_support.cpp | 65 +++++++++++++++++++++++++++++++++++- src/traversal_support.hpp | 28 ++++++++++++++++ 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index baae2060516..c282b1122c1 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -265,8 +265,8 @@ int main_call(int argc, char** argv) { // Load our packed supports (they must have come from vg pack on graph) packer = unique_ptr<Packer>(new Packer(graph)); packer->load_from_file(pack_filename); - // Make a packed traversal support finder - PackedTraversalSupportFinder* packed_support_finder = new PackedTraversalSupportFinder(*packer, *snarl_manager); + // Make a packed traversal support finder (using the cached version is important for the poisson caller) + PackedTraversalSupportFinder* packed_support_finder = new CachedPackedTraversalSupportFinder(*packer, *snarl_manager); support_finder = unique_ptr<TraversalSupportFinder>(packed_support_finder); SupportBasedSnarlCaller* packed_caller = nullptr; diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index 2bcb478b7c6..b561bdefd56 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -305,7 +305,7 @@ Support PackedTraversalSupportFinder::get_edge_support(const edge_t& edge) const } Support PackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse, - id_t to, bool to_reverse) const { + id_t to, bool to_reverse) const { Edge proto_edge; proto_edge.set_from(from); proto_edge.set_from_start(from_reverse); @@ -345,4 +345,67 @@ Support PackedTraversalSupportFinder::get_avg_node_support(id_t node) const { } +CachedPackedTraversalSupportFinder::CachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size) : + PackedTraversalSupportFinder(packer, snarl_manager) { + size_t num_threads = get_thread_count(); + min_node_support_cache.resize(num_threads);
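+ // (each of the three cache vectors gets one LRU cache per thread; the getters below pick theirs with omp_get_thread_num(), so lookups stay thread-confined and need no locking)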
avg_node_support_cache.resize(num_threads); + edge_support_cache.resize(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + min_node_support_cache[i] = new LRUCache<id_t, Support>(cache_size); + avg_node_support_cache[i] = new LRUCache<id_t, Support>(cache_size); + edge_support_cache[i] = new LRUCache<edge_t, Support>(cache_size); + } +} + +CachedPackedTraversalSupportFinder::~CachedPackedTraversalSupportFinder() { + for (size_t i = 0; i < min_node_support_cache.size(); ++i) { + delete min_node_support_cache[i]; + delete avg_node_support_cache[i]; + delete edge_support_cache[i]; + } +} + +Support CachedPackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse, + id_t to, bool to_reverse) const { + const HandleGraph* graph = packer.get_graph(); + edge_t edge = graph->edge_handle(graph->get_handle(from, from_reverse), + graph->get_handle(to, to_reverse)); + + auto& support_cache = *edge_support_cache[omp_get_thread_num()]; + pair<Support, bool> cached = support_cache.retrieve(edge); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_edge_support(from, from_reverse, to, to_reverse); + support_cache.put(edge, support); + return support; + } +} + +Support CachedPackedTraversalSupportFinder::get_min_node_support(id_t node) const { + auto& support_cache = *min_node_support_cache[omp_get_thread_num()]; + pair<Support, bool> cached = support_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_min_node_support(node); + support_cache.put(node, support); + return support; + } +} + +Support CachedPackedTraversalSupportFinder::get_avg_node_support(id_t node) const { + auto& support_cache = *avg_node_support_cache[omp_get_thread_num()]; + pair<Support, bool> cached = support_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_avg_node_support(node); + support_cache.put(node, support); + return support; + } +} + + } diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp index 42fd10607bb..c26c7899559 100644 --- a/src/traversal_support.hpp +++ b/src/traversal_support.hpp @@ -119,6 +119,34 @@ class PackedTraversalSupportFinder : public TraversalSupportFinder { const Packer& packer; }; +/** + * Add a caching overlay to the PackedTraversalSupportFinder to avoid frequent + * base queries which can become expensive.
Even caching the edges seems + * to have an impact + */ +class CachedPackedTraversalSupportFinder : public PackedTraversalSupportFinder { +public: + CachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size = 100000); + virtual ~CachedPackedTraversalSupportFinder(); + + /// Support of an edge + virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const; + + /// Minimum support of a node + virtual Support get_min_node_support(id_t node) const; + + /// Average support of a node + virtual Support get_avg_node_support(id_t node) const; + +protected: + + /// One cache of each kind per thread + mutable vector<LRUCache<edge_t, Support>*> edge_support_cache; + mutable vector<LRUCache<id_t, Support>*> min_node_support_cache; + mutable vector<LRUCache<id_t, Support>*> avg_node_support_cache; +}; + + } #endif From b7dc1bbe870c058c9d41e56878d37e39e088fd9b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 14 Nov 2019 10:20:32 -0500 Subject: [PATCH 51/79] remove last critical section in caller --- src/graph_caller.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 141ac63bede..f96de89b2fc 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -15,17 +15,16 @@ GraphCaller::~GraphCaller() { void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { // Used to recurse on children of parents that can't be called - vector<const Snarl*> snarl_queue; + size_t thread_count = get_thread_count(); + vector<vector<const Snarl*>> snarl_queue(thread_count); // Run the snarl caller on a snarl, and queue up the children if it fails auto process_snarl = [&](const Snarl* snarl) { bool was_called = call_snarl(*snarl); if (!was_called && recurse_on_fail) { const vector<const Snarl*>& children = snarl_manager.children_of(snarl); -#pragma omp critical (snarl_queue) - { - snarl_queue.insert(snarl_queue.end(), children.begin(), children.end()); - } + vector<const Snarl*>& thread_queue = snarl_queue[omp_get_thread_num()]; + thread_queue.insert(thread_queue.end(), children.begin(), children.end()); } }; @@ -33,9 +32,14 @@ void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { snarl_manager.for_each_top_level_snarl_parallel(process_snarl); // Then recurse on any children the snarl caller failed to handle - while (!snarl_queue.empty()) { + while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), + [](const vector<const Snarl*>& snarl_vec) {return snarl_vec.empty();})) { vector<const Snarl*> cur_queue; - std::swap(snarl_queue, cur_queue); + for (vector<const Snarl*>& thread_queue : snarl_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } #pragma omp parallel for for (int i = 0; i < cur_queue.size(); ++i) { process_snarl(cur_queue[i]); From 98ae8bdf0584f2b73ef45beaa1b821994a21ac43 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 Nov 2019 13:31:14 -0800 Subject: [PATCH 52/79] Clear out old cmake stuff to better handle GNU->clang compiler change --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a99320eb884..4ae906710d1 100644 --- a/Makefile +++ b/Makefile @@ -499,9 +499,11 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc # We moved the Dynamic headers so make sure to clean up the old ones.
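# Note: the recipes below now start from a wiped build directory, so cmake cannot reuse settings detected for a previous compiler (the GNU->clang change this patch targets).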

From 98ae8bdf0584f2b73ef45beaa1b821994a21ac43 Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Thu, 14 Nov 2019 13:31:14 -0800
Subject: [PATCH 52/79] Clear out old cmake stuff to better handle GNU->clang
 compiler change

---
 Makefile | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index a99320eb884..4ae906710d1 100644
--- a/Makefile
+++ b/Makefile
@@ -499,9 +499,11 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc
 
 # We moved the Dynamic headers so make sure to clean up the old ones.
 $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/*.hpp $(DYNAMIC_DIR)/include/internal/*.hpp
 	rm -Rf $(INC_DIR)/dynamic.hpp $(INC_DIR)/dynamic
-	mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/* $(INC_DIR)/dynamic
 	# annoyingly doesn't have an install option on the cmake, so we manually move their external dependency headers
-	cd $(CWD)/$(DYNAMIC_DIR) && mkdir -p build && cd build && cmake .. && make && cp -r hopscotch_map-prefix/src/hopscotch_map/include/* $(CWD)/$(INC_DIR)/dynamic
+	cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && cmake .. && make && cp -r hopscotch_map-prefix/src/hopscotch_map/include/* $(CWD)/$(INC_DIR)/
+	# Do the copy of the main file last so we can tell if this recipe failed and redo it.
+	# Otherwise we get dynamic.hpp without its deps
+	mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/* $(INC_DIR)/dynamic
 
 $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h)
 	+. ./source_me.sh && cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="-L/opt/local/lib" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install
@@ -617,7 +619,7 @@ $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB
 	. ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/
 
 $(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/src/*.cpp $(LIBBDSG_DIR)/include/bdsg/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp
-	+. ./source_me.sh && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && pwd && cp -r include/bdsg $(CWD)/$(INC_DIR)
+	+. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && pwd && cp -r include/bdsg $(CWD)/$(INC_DIR)
 
 $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp
 $(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp

From 5ad96cdc3d4a9986087c8a3c1cf71f8d7d4fbd38 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Thu, 14 Nov 2019 16:41:20 -0500
Subject: [PATCH 53/79] more constraints for traversal enumeration

---
 src/snarl_caller.cpp      | 18 +++++++++++-------
 src/snarl_caller.hpp      |  8 +++++++-
 src/traversal_support.cpp |  3 ++-
 src/traversal_support.hpp |  6 ++++--
 4 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index c7567a90439..3a1754ceb4b 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -305,7 +305,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
         shared_travs.push_back(genotype[0]);
     }
     // compute the support of our called alleles
-    vector<Support> allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0);
+    vector<Support> allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0);
 
     // Compute the total support for all the alts that will be appearing
     Support total_support = std::accumulate(allele_supports.begin(), allele_supports.end(), Support());
@@ -481,6 +481,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
     // sort the traversals by support
     vector<int> ranked_traversals = rank_by_support(supports);
     size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size());
+    size_T max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size());
+    // take the top-m traversals in order to check against the top traversal
+    set<int> top_traversals(ranked_traversals.begin(), ranked_traversals.begin() + max_sec_trav);
 
     // the candidate genotypes and their supports.
     // the numbers here are alleles as indexed in traversals[]
     set<vector<int>> candidates;
 
@@ -507,7 +510,7 @@
 
     // we prune out traversals whose exclusive support (structure that is not shared with best traversal)
     // doesn't meet a certain cutoff
-    vector<Support> secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, true, false, false, ref_trav_idx);
+    vector<Support> secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, top_traversals, true, false, false, ref_trav_idx);
     for (int j = 0; j < secondary_exclusive_supports.size(); ++j) {
         if (j != best_allele &&
             support_val(secondary_exclusive_supports[j]) < min_total_support_for_call &&
@@ -517,7 +520,7 @@
     }
 
     // get the supports of each traversal in light of best
-    vector<Support> secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, false, false, false, ref_trav_idx);
+    vector<Support> secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, top_traversals, false, false, false, ref_trav_idx);
     vector<int> ranked_secondary_traversals = rank_by_support(secondary_supports);
 
     // add the homozygous genotype for our best allele
@@ -551,7 +554,7 @@
     double best_genotype_likelihood = -numeric_limits<double>::max();
     vector<int> best_genotype;
     for (const auto& candidate : candidates) {
-        double gl = genotype_likelihood(candidate, traversals, ref_trav_idx, exp_depth, depth_err);
+        double gl = genotype_likelihood(candidate, traversals, top_traversals, ref_trav_idx, exp_depth, depth_err);
         if (gl > best_genotype_likelihood) {
             best_genotype_likelihood = gl;
             best_genotype = candidate;
@@ -566,12 +569,13 @@
 
 double PoissonSupportSnarlCaller::genotype_likelihood(const vector<int>& genotype,
                                                       const vector<SnarlTraversal>& traversals,
+                                                      const set<int>& trav_subset,
                                                       int ref_trav_idx, double exp_depth, double depth_err) {
 
     assert(genotype.size() == 1 || genotype.size() == 2);
 
     // get the genotype support
-    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, ref_trav_idx);
+    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, trav_subset, ref_trav_idx);
 
     // get the total support over the site
     Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support());
@@ -645,7 +649,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     assert(traversals.size() == variant.alleles.size());
 
     // get the genotype support
-    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0);
+    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0);
 
     // Get the depth of the site
     Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support());
@@ -690,7 +694,7 @@
     // assume ploidy 2
     for (int i = 0; i < traversals.size(); ++i) {
         for (int j = i; j < traversals.size(); ++j) {
-            double gl = genotype_likelihood({i, j}, traversals, 0, exp_depth, depth_err);
+            double gl = genotype_likelihood({i, j}, traversals, {}, 0, exp_depth, depth_err);
             gen_likelihoods.push_back(gl);
             if (vector<int>({i, j}) == genotype || vector<int>({j,i}) == genotype) {
                 gen_likelihood = gl;
diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp
index 1be11053721..1ec2ce9ac24 100644
--- a/src/snarl_caller.hpp
+++ b/src/snarl_caller.hpp
@@ -214,8 +214,11 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     /// P[allele1] * P[allele2] * P[uncalled alleles]
     /// Homozygous alleles are split into two, with half support each
     /// The (natural) logarithm is returned
+    /// If trav_subset is not empty, traversals outside that set (and genotype)
+    /// will be ignored to save time
     double genotype_likelihood(const vector<int>& genotype,
                                const vector<SnarlTraversal>& traversals,
+                               const set<int>& trav_subset,
                                int ref_trav_idx, double exp_depth, double depth_err);
 
     /// Rank supports
@@ -225,7 +228,10 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     double baseline_mapping_error = 0.005;
 
     /// Consider up to the top-k traversals (based on support) for genotyping
-    size_t top_k = 25;
+    size_t top_k = 20;
+    /// Consider up to the top-m secondary traversals (based on support) for each top traversal
+    /// (so at most top_k * top_m considered)
+    size_t top_m = 100;
 
     /// Map path name to depth coverage from the packer
     const algorithms::BinnedDepthIndex& depth_index;

diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp
index b561bdefd56..156733f7d92 100644
--- a/src/traversal_support.cpp
+++ b/src/traversal_support.cpp
@@ -65,13 +65,14 @@ Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& trav
 
 vector<Support> TraversalSupportFinder::get_traversal_genotype_support(const vector<SnarlTraversal>& traversals,
                                                                        const vector<int>& genotype,
+                                                                       const set<int>& other_trav_subset,
                                                                        int ref_trav_idx) {
     set<int> tgt_trav_set(genotype.begin(), genotype.end());
     vector<int> tgt_travs(tgt_trav_set.begin(), tgt_trav_set.end());
     // get the support of just the alleles in the genotype, evenly splitting shared stuff
     vector<Support> allele_support = get_traversal_set_support(traversals, tgt_travs, tgt_trav_set, false, false, true, ref_trav_idx);
     // get the support of everything else, treating stuff in the genotype alleles as 0
-    vector<Support> other_support = get_traversal_set_support(traversals, tgt_travs, {}, false, true, false, ref_trav_idx);
+    vector<Support> other_support = get_traversal_set_support(traversals, tgt_travs, other_trav_subset, false, true, false, ref_trav_idx);
     // combine the above two vectors
     for (int allele : tgt_travs) {
         other_support[allele] = allele_support[allele];

diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp
index c26c7899559..de1704559d5 100644
--- a/src/traversal_support.hpp
+++ b/src/traversal_support.hpp
@@ -50,9 +50,11 @@ class TraversalSupportFinder {
     /// some alleles in a genotype, where everything is split evenly among them
     /// anything not in the genotype gets a support using "exclusive_count"
     /// where nodes taken by the genotype are counted as 0
+    /// stuff not in the genotype is limited to other_trav_subset (or all if empty)
     virtual vector<Support> get_traversal_genotype_support(const vector<SnarlTraversal>& traversals,
-                                                           const vector<int>& genotype,
-                                                           int ref_trav_idx = -1);
+                                                           const vector<int>& genotype,
+                                                           const set<int>& other_trav_subset,
+                                                           int ref_trav_idx = -1);
 
     /// traversals: get support for each traversal in this set
     /// shared_travs: if a node appears N times in shared_travs, then it will count as 1 / (N+1) support
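
Patch 53 above prunes genotyping work by ranking traversals on support and drawing candidate genotypes only from the best few (top_k), checked against a somewhat larger top_m set. Reduced to a standalone toy — the support values and names below are invented for illustration, not vg's API — the ranking-and-truncation step looks like this:

```cpp
// Sketch: rank candidate alleles by a support value, then keep only the
// top-k as primary candidates and the top-m as the comparison set.
#include <algorithm>
#include <numeric>
#include <set>
#include <vector>
#include <iostream>

int main() {
    std::vector<double> support = {3.5, 10.0, 0.5, 7.25, 7.25};  // toy values

    // indices 0..n-1 sorted by decreasing support
    std::vector<int> ranked(support.size());
    std::iota(ranked.begin(), ranked.end(), 0);
    std::stable_sort(ranked.begin(), ranked.end(),
                     [&](int a, int b) { return support[a] > support[b]; });

    std::size_t top_k = 2, top_m = 4;
    std::size_t max_trav = std::min(top_k, ranked.size());
    std::size_t max_sec_trav = std::min(top_m, ranked.size());

    // primary candidates and the larger secondary set, as in the patch
    std::vector<int> primary(ranked.begin(), ranked.begin() + max_trav);
    std::set<int> secondary(ranked.begin(), ranked.begin() + max_sec_trav);

    for (int allele : primary) {
        std::cout << "candidate allele " << allele
                  << " support " << support[allele] << "\n";
    }
    std::cout << secondary.size() << " alleles kept for comparison\n";
    return 0;
}
```
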

From 04c7536c2d10caeeb191b9a433b0a38c9e63de7a Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Thu, 14 Nov 2019 16:56:38 -0500
Subject: [PATCH 54/79] typo

---
 src/snarl_caller.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 3a1754ceb4b..46a589cec09 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -481,7 +481,7 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
     // sort the traversals by support
     vector<int> ranked_traversals = rank_by_support(supports);
     size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size());
-    size_T max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size());
+    size_t max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size());
     // take the top-m traversals in order to check against the top traversal
     set<int> top_traversals(ranked_traversals.begin(), ranked_traversals.begin() + max_sec_trav);

From c3752cdcb0ab83005edaeb217695d7c6690e00f6 Mon Sep 17 00:00:00 2001
From: Jerven bolleman
Date: Thu, 7 Nov 2019 20:19:11 +0100
Subject: [PATCH 55/79] The VG ontology was missing position, which we have
 been using for a while already. Also, a domain was wrong.

---
 ontology/vg.html | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 ontology/vg.ttl  | 10 +++++++++-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/ontology/vg.html b/ontology/vg.html
index cd2449e6754..23e4a11ac1d 100644
--- a/ontology/vg.html
+++ b/ontology/vg.html
@@ -144,13 +144,13 @@
[HTML table diff, rendered text only: in the summary table, the count of
Properties changes from 7 to 8 and the count of Object properties changes
from 6 to 7.]
@@ -829,9 +831,51 @@

[HTML table diff, rendered text only: under "Object properties", the existing
vg:node entry has its rdfs:range cell corrected from vg:Step to vg:Node, and a
new vg:position entry (rdf:type owl:ObjectProperty) is added with rdfs:comment
"This is the position on the reference path at which this step starts.",
rdfs:domain vg:Step, rdfs:label "position", and rdfs:range xsd:positiveInteger.]
diff --git a/ontology/vg.ttl b/ontology/vg.ttl
index ffca973c4dd..95bc7e6e975 100644
--- a/ontology/vg.ttl
+++ b/ontology/vg.ttl
@@ -71,8 +71,16 @@
   rdfs:comment "This means that this step occurs on the forward strand of the sequence attached to the node (i.e. it is on the explicit encoded forward (5' to 3') strand) of the predicate node."^^xsd:string ;
   rdfs:domain :Step ;
   rdfs:label "node"^^xsd:string ;
-  rdfs:range :Step ;
+  rdfs:range :Node ;
   .
+
+:position
+  rdf:type owl:ObjectProperty ;
+  rdfs:comment "This is the position on the reference path at which this step starts."^^xsd:string ;
+  rdfs:domain :Step ;
+  rdfs:label "position"^^xsd:string ;
+  rdfs:range xsd:positiveInteger .
+
 :rank
   rdf:type owl:DatatypeProperty ;
   rdfs:comment "The rank records the step place along its path."^^xsd:string ;
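
Patch 56 below changes the error rate that feeds the Poisson read-depth model: the expected number of stray (misplaced) reads at a site is modeled as Poisson with mean error_rate * exp_depth. For reference, a self-contained sketch of that log-likelihood — an illustration of the standard formula, not vg's implementation:

```cpp
// Sketch: Poisson log-likelihood of seeing k "other" reads when the
// error model predicts lambda = error_rate * expected_depth on average.
#include <cmath>
#include <iostream>

// log P(k | lambda) for a Poisson distribution, computed in log space
// so large k doesn't overflow the factorial.
double poisson_log_pmf(double lambda, int k) {
    return k * std::log(lambda) - lambda - std::lgamma(k + 1.0);
}

int main() {
    double exp_depth = 30.0;    // expected coverage at this site
    double error_rate = 0.005;  // flat mapping-error rate, as in the patch
    double lambda = error_rate * exp_depth;

    for (int k : {0, 1, 5}) {
        std::cout << "log P(" << k << " stray reads) = "
                  << poisson_log_pmf(lambda, k) << "\n";
    }
    return 0;
}
```
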

From c5a5fa4b1501f3dcab28a31baa8f1ee9feb525b9 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 18 Nov 2019 16:08:23 -0500
Subject: [PATCH 56/79] pad depth coverage check and just use baseline error

---
 src/snarl_caller.cpp | 19 ++++++++++++++++---
 src/snarl_caller.hpp |  3 +++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 46a589cec09..41fb0d4eb59 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -599,7 +599,12 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector<int>& genotype,
     }
 
     // how many reads would we expect to not map to our genotype due to error
-    double error_rate = std::min(0.05, depth_err + baseline_mapping_error);
+    // Note: The bin size is set quite a bit smaller than originally intended as it seems to
+    // help nearly every benchmark.  But the small bin size means that depth_err, the
+    // error from the binned coverage, is way too high and including it only causes trouble.
+    // tldr: just use the baseline_mapping_error constant and forget about depth_err for now.
+    //double error_rate = std::min(0.05, depth_err + baseline_mapping_error);
+    double error_rate = baseline_mapping_error;
 
     double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support);
 
     // and our likelihood for the unmapped reads we see:
@@ -648,6 +653,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
 
     assert(traversals.size() == variant.alleles.size());
 
+    // get the traversal sizes
+    vector<int> traversal_sizes = support_finder.get_traversal_sizes(traversals);
+
     // get the genotype support
     vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0);
 
     // Get the depth of the site
     Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support());
@@ -684,8 +692,13 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     double gen_likelihood;
 
     variant.format.push_back("GL");
 
-    // expected depth from our coverage
-    pair<size_t, size_t> ref_range = make_pair(variant.position, variant.position + variant.ref.length());
+    // expected depth from our coverage. we look at the reference-range from the snarl plus a bit of padding,
+    // averaging over every depth bin this touches.  todo: adaptively compute nearby coverage without bins
+    // (requires VCFGenotyper to be refactored to require a PathPositionIndex)
+    size_t longest_traversal = *max_element(traversal_sizes.begin(), traversal_sizes.end());
+    size_t padding = (depth_padding_factor * longest_traversal) / 2;
+    pair<size_t, size_t> ref_range = make_pair(max((long)0, (long)(variant.position - padding)),
+                                               variant.position + variant.ref.length() + padding);
     auto depth_info = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first, ref_range.second);
     double exp_depth = depth_info.first;
     double depth_err = depth_info.second;

diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp
index 1ec2ce9ac24..b2319c9197c 100644
--- a/src/snarl_caller.hpp
+++ b/src/snarl_caller.hpp
@@ -232,6 +232,9 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     /// Consider up to the top-m secondary traversals (based on support) for each top traversal
     /// (so at most top_k * top_m considered)
     size_t top_m = 100;
+
+    /// padding to apply wrt the longest traversal to snarl ranges when looking up binned depth
+    double depth_padding_factor = 1.;
 
     /// Map path name to depth coverage from the packer
     const algorithms::BinnedDepthIndex& depth_index;

From 64428e2af14dace4bc118fc4cfb127a99166bae8 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 19 Nov 2019 16:58:02 -0500
Subject: [PATCH 57/79] allow 1bp bin and its nan variance

---
 src/algorithms/coverage_depth.cpp |  4 ++--
 src/snarl_caller.cpp              | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp
index 7429a3b6e0d..3e39894c980 100644
--- a/src/algorithms/coverage_depth.cpp
+++ b/src/algorithms/coverage_depth.cpp
@@ -90,7 +90,7 @@ pair<double, double> packed_depth_of_bin(const Packer& packer,
             }
         }
     }
-    return wellford_mean_var(bin_length, mean, M2, true);
+    return wellford_mean_var(bin_length, mean, M2);
 }
 
 vector<tuple<size_t, size_t, double, double>> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size,
@@ -225,7 +225,7 @@ static pair<double, double> combine_and_average_node_coverages(const HandleGraph
         }
     }
 
-    return wellford_mean_var(count, mean, M2, count < graph.get_node_count());
+    return wellford_mean_var(count, mean, M2);
 }
 
diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 41fb0d4eb59..26333773822 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -547,8 +547,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
     // expected depth from our coverage
     auto depth_info = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first, ref_range.second);
     double exp_depth = depth_info.first;
-    double depth_err = depth_info.second;
-    assert(!isnan(exp_depth) && !isnan(depth_err));
+    assert(!isnan(exp_depth));
+    // variance/std-err can be nan when binsize < 2. We just clamp it to 0
+    double depth_err = !isnan(depth_info.second) ? depth_info.second : 0.;
 
     // genotype (log) likelihoods
     double best_genotype_likelihood = -numeric_limits<double>::max();
@@ -701,8 +702,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
                                                 variant.position + variant.ref.length() + padding);
     auto depth_info = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first, ref_range.second);
     double exp_depth = depth_info.first;
-    double depth_err = depth_info.second;
-    assert(!isnan(exp_depth) && !isnan(depth_err));
+    assert(!isnan(exp_depth));
+    // variance/std-err can be nan when binsize < 2. We just clamp it to 0
+    double depth_err = !isnan(depth_info.second) ? depth_info.second : 0.;
 
     // assume ploidy 2
     for (int i = 0; i < traversals.size(); ++i) {
         for (int j = i; j < traversals.size(); ++j) {

From dd9113afb32dc8658fb744227ed6957b3f400f96 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Wed, 20 Nov 2019 10:44:07 -0500
Subject: [PATCH 58/79] Cut softclips by default in vg augment. Disable with
 -S instead of enabling with -C

---
 README.md                       |  7 ++++---
 src/subcommand/augment_main.cpp | 13 ++++++++-----
 test/t/04_vg_align.t            |  4 ++--
 test/t/17_vg_augment.t          | 14 +++++++-------
 test/t/18_vg_call.t             |  2 +-
 5 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 028a68ae0e7..f2156bab557 100644
--- a/README.md
+++ b/README.md
@@ -224,11 +224,12 @@ Variation from alignments can be embedded back into the graph. This process is
 
 ```sh
 # augment the graph with all variation from the GAM except that implied by soft clips, saving to aug.vg.  aug.gam contains the same reads as aln.gam but mapped to aug.vg
-vg augment x.vg aln.gam -C -A aug.gam > aug.vg
+vg augment x.vg aln.gam -A aug.gam > aug.vg
 
 # augment the graph with all variation from the GAM, saving each mapping as a path in the graph.
+# softclips of alignment paths are preserved (`-S`).
 # Note, this can be much less efficient than the above example if there are many alignments in the GAM
-vg augment x.vg aln.gam -i > aug_with_paths.vg
+vg augment x.vg aln.gam -i -S > aug_with_paths.vg
 ```
 
 ### Variant Calling
@@ -247,7 +248,7 @@ vg pack -x x.xg -g aln.gam -Q 5 -o aln.pack
 vg call x.xg -k aln.pack > graph_calls.vcf
 ```
 
-In order to also consider *novel* variants from the reads, use the augmented graph and gam (as created in the previous example using `vg augment -C -A`):
+In order to also consider *novel* variants from the reads, use the augmented graph and gam (as created in the previous example using `vg augment -A`):
 
 ```sh
 # Index our augmented graph
diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp
index 994a65c4050..c694628e22c 100644
--- a/src/subcommand/augment_main.cpp
+++ b/src/subcommand/augment_main.cpp
@@ -44,7 +44,7 @@ void help_augment(char** argv, ConfigurableParser& parser) {
          << endl
          << "general options:" << endl
          << "    -i, --include-paths       merge the paths implied by alignments into the graph" << endl
-         << "    -C, --cut-softclips       drop softclips from the paths (recommended)" << endl
+         << "    -S, --keep-softclips      include softclips from input alignments (they are cut by default)" << endl
         << "    -B, --label-paths         don't augment with alignments, just use them for labeling the graph" << endl
         << "    -Z, --translation FILE    save translations from augmented back to base graph to FILE" << endl
         << "    -A, --alignment-out FILE  save augmented GAM reads to FILE" << endl
@@ -74,7 +74,7 @@ int main_augment(int argc, char** argv) {
     bool include_paths = false;
 
     // Include the softclips for each path
-    bool include_softclips = true;
+    bool include_softclips = false;
 
     // Just label the paths with the GAM
     bool label_paths = false;
@@ -121,7 +121,7 @@ int main_augment(int argc, char** argv) {
         {"translation", required_argument, 0, 'Z'},
         {"alignment-out", required_argument, 0, 'A'},
         {"include-paths", no_argument, 0, 'i'},
-        {"cut-softclips", no_argument, 0, 'C'},
+        {"keep-softclips", no_argument, 0, 'S'},
         {"label-paths", no_argument, 0, 'B'},
         {"subgraph", no_argument, 0, 's'},
         {"min-coverage", required_argument, 0, 'm'},
         {"include-gt", required_argument, 0, 'L'},
         {0, 0, 0, 0}
     };
-    static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:q:Q:";
+
static const char* short_options = "a:Z:A:iCSBhpvt:l:L:sm:c:q:Q:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -160,7 +160,10 @@ int main_augment(int argc, char** argv) { include_paths = true; break; case 'C': - include_softclips = false; + cerr << "[vg augment] warning: -C / --cut-softclips option is deprecated (now enabled by default)" << endl; + break; + case 'S': + include_softclips = true; break; case 'B': label_paths = true; diff --git a/test/t/04_vg_align.t b/test/t/04_vg_align.t index cb78999eca4..d78525815fd 100644 --- a/test/t/04_vg_align.t +++ b/test/t/04_vg_align.t @@ -37,8 +37,8 @@ is $(vg align -js GGCTATGTCTGAACTAGGAGGGTAGAAAGAATATTCATTTTGGTTGCCACAAACCATCGAAA vg construct -m 1000 -r tiny/tiny.fa >t.vg seq=CAAATAAGGCTTGGAAATGTTCTGGAGTTCTATTATATTCCAACTCTCTT -vg align -s $seq t.vg | vg augment t.vg - -i >t2.vg -is $(vg align -s $seq -Q query t2.vg | vg augment t2.vg - -i -B | vg view - | grep "query" | cut -f 3 | grep -o "[0-9]\+" | wc -l) 4 "align can use query names and outputs GAM" +vg align -s $seq t.vg | vg augment t.vg - -i -S >t2.vg +is $(vg align -s $seq -Q query t2.vg | vg augment t2.vg - -i -B -S | vg view - | grep "query" | cut -f 3 | grep -o "[0-9]\+" | wc -l) 4 "align can use query names and outputs GAM" rm t.vg t2.vg diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index ed66c2dc910..99a52fe7977 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -61,8 +61,8 @@ rm -rf t.idx.xg t.idx.gcsa read_aug.gam vg construct -v tiny/tiny.vcf.gz -r tiny/tiny.fa >t.vg vg align -s GGGGGGGAAATTTTCTGGAGTTCTATTATATTCCAAAAAAAAAA t.vg >t.gam -is $(vg augment -i t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i t.vg t.gam | vg stats -H - | awk '{ print $3}') | cut -f 3) GGGGG "a soft clip at read start becomes a new head of the graph" -is $(vg augment -i t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i t.vg t.gam | vg stats -T - | awk '{ print $3}') | cut -f 3) AAAAAAAA "a soft clip at read end becomes a new tail of the graph" +is $(vg augment -i -S t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i -S t.vg t.gam | vg stats -H - | awk '{ print $3}') | cut -f 3) GGGGG "a soft clip at read start becomes a new head of the graph" +is $(vg augment -i -S t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i -S t.vg t.gam | vg stats -T - | awk '{ print $3}') | cut -f 3) AAAAAAAA "a soft clip at read end becomes a new tail of the graph" vg align -s AAATTTTCTGGAGTTCTAT t.vg >> t.gam vg find -x t.vg -n 9 -c 1 > n9.vg vg augment n9.vg t.gam -s -A n9_aug.gam > /dev/null @@ -72,7 +72,7 @@ rm -rf t.vg t.gam n9.vg n9_aug.gam vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg -g x.gcsa -k 16 x.vg vg map -x x.xg -g x.gcsa -G small/x-s1337-n100-e0.01-i0.005.gam -t 1 >x.gam -vg augment -Z x.trans -i x.vg x.gam >x.mod.vg +vg augment -Z x.trans -i -S x.vg x.gam >x.mod.vg is $(vg view -Z x.trans | wc -l) 1288 "the expected graph translation is exported when the graph is edited" rm -rf x.vg x.xg x.gcsa x.reads x.gam x.mod.vg x.trans @@ -82,17 +82,17 @@ vg index -x 2snp.xg 2snp.vg vg sim -l 30 -x 2snp.xg -n 30 -a >2snp.sim vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam -is $(vg augment flat.vg 2snp.gam -i | vg mod -D - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" +is $(vg augment flat.vg 2snp.gam -i -S | vg mod -D - | vg 
mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg vg sim -l 30 -x 2err.vg -n 10 -a >2err.sim vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam cat 2snp.gam 2err.gam > 4edits.gam -vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes -vg augment flat.vg 2snp.gam -m 1 | vg view - | grep S | awk '{print $3}' | sort > 2snp_m1.nodes +vg augment flat.vg 2snp.gam -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes +vg augment flat.vg 2snp.gam -m 1 -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_m1.nodes diff 2snp_default.nodes 2snp_m1.nodes is "$?" 0 "augmenting 2 snps with -m 1 produces the same nodes as default" -vg augment flat.vg 4edits.gam -m 11 | vg view - | grep S | awk '{print $3}' | sort > 4edits_m11.nodes +vg augment flat.vg 4edits.gam -m 11 -S | vg view - | grep S | awk '{print $3}' | sort > 4edits_m11.nodes diff 2snp_default.nodes 4edits_m11.nodes is "$?" 0 "augmenting 2 snps and 2 errors with -m 11 produces the same nodes as with just the snps" diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index a15baac7044..b58a699f738 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -104,6 +104,6 @@ vg augment c.vg m.gam -A m.aug.gam >c.aug.vg vg index -x c.aug.xg c.aug.vg vg pack -x c.aug.xg -g m.aug.gam -o m.aug.pack vg call c.aug.xg -k m.aug.pack >m.vcf -is $(cat m.vcf | grep -v "^#" | wc -l) 3 "vg call finds true homozygous variants in a cyclic graph" +is $(cat m.vcf | grep -v "^#" | wc -l) 4 "vg call finds true homozygous variants in a cyclic graph" rm -f c.vg c.xg c.gcsa c.gcsa.lcp m.fa m.vg m.xg m.sim m.gam m.aug.gam c.aug.vg c.aug.xg m.aug.pack m.vcf From 01e0aac083a909d9e5bd2a2350b8ebc788be1e47 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 20 Nov 2019 13:25:01 -0500 Subject: [PATCH 59/79] deprecate instead of remove -C --- src/subcommand/augment_main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index c694628e22c..a9cc8effe58 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -121,6 +121,7 @@ int main_augment(int argc, char** argv) { {"translation", required_argument, 0, 'Z'}, {"alignment-out", required_argument, 0, 'A'}, {"include-paths", no_argument, 0, 'i'}, + {"cut-softclips", no_argument, 0, 'C'}, {"keep-softclips", no_argument, 0, 'S'}, {"label-paths", no_argument, 0, 'B'}, {"subgraph", no_argument, 0, 's'}, From fe1b439fc7a77f270cfe065afeb5669136f9ad5f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Nov 2019 12:00:09 -0500 Subject: [PATCH 60/79] handlify vg_set --- deps/libbdsg | 2 +- deps/libhandlegraph | 2 +- deps/xg | 2 +- src/io/save_handle_graph.hpp | 64 ++++++++++++++++++++++++ src/subcommand/augment_main.cpp | 17 ++----- src/subcommand/convert_main.cpp | 15 ++---- src/subcommand/ids_main.cpp | 37 ++++++++++---- src/vg_set.cpp | 88 +++++++++++++++------------------ src/vg_set.hpp | 9 ++-- 9 files changed, 146 insertions(+), 90 deletions(-) create mode 100644 src/io/save_handle_graph.hpp diff --git a/deps/libbdsg b/deps/libbdsg index d69763eca9f..6c57975dc96 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit d69763eca9fe796bdeb5abd050a585934a8b6407 +Subproject commit 
6c57975dc969403b6cd8ae0017315b176812a793
diff --git a/deps/libhandlegraph b/deps/libhandlegraph
index 541b97315fd..729d2c86805 160000
--- a/deps/libhandlegraph
+++ b/deps/libhandlegraph
@@ -1 +1 @@
-Subproject commit 541b97315fd413846f5a76476907f8d2b2276242
+Subproject commit 729d2c868053d2e2cbe89f9ecf46ee641235ed52
diff --git a/deps/xg b/deps/xg
index fb89754ecde..e3ee79f0550 160000
--- a/deps/xg
+++ b/deps/xg
@@ -1 +1 @@
-Subproject commit fb89754ecde62ddfd4758e4e37004839daeade78
+Subproject commit e3ee79f055083a298f7d04a5fb0d56dd34967b7c
diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp
new file mode 100644
index 00000000000..9085ac5d3c1
--- /dev/null
+++ b/src/io/save_handle_graph.hpp
@@ -0,0 +1,64 @@
+#ifndef VG_IO_SAVE_HANDLE_GRAPH_IO_HPP_INCLUDED
+#define VG_IO_SAVE_HANDLE_GRAPH_IO_HPP_INCLUDED
+
+/**
+ * \file save_handle_graph.hpp
+ * Use vpkg to serialize a HandleGraph object
+ */
+
+#include "bdsg/packed_graph.hpp"
+#include "bdsg/hash_graph.hpp"
+#include "bdsg/odgi.hpp"
+#include "vg.hpp"
+#include "xg.hpp"
+#include <vg/io/vpkg.hpp>
+#include <iostream>
+
+namespace vg {
+
+namespace io {
+
+using namespace std;
+
+
+/**
+ * Save a handle graph using the VPKG::save() function.
+ * Todo: should this be somewhere else (ie in vgio with new types registered?)
+ */
+inline void save_handle_graph(HandleGraph* graph, ostream& os) {
+    if (dynamic_cast<VG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<VG*>(graph), os);
+    } else if (dynamic_cast<bdsg::HashGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph), os);
+    } else if (dynamic_cast<bdsg::PackedGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph), os);
+    } else if (dynamic_cast<bdsg::ODGI*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph), os);
+    } else if (dynamic_cast<xg::XG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<xg::XG*>(graph), os);
+    } else {
+        throw runtime_error("Internal error: unable to serialize graph");
+    }
+}
+
+inline void save_handle_graph(HandleGraph* graph, const string& dest_path) {
+    if (dynamic_cast<VG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<VG*>(graph), dest_path);
+    } else if (dynamic_cast<bdsg::HashGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph), dest_path);
+    } else if (dynamic_cast<bdsg::PackedGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph), dest_path);
+    } else if (dynamic_cast<bdsg::ODGI*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph), dest_path);
+    } else if (dynamic_cast<xg::XG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<xg::XG*>(graph), dest_path);
+    } else {
+        throw runtime_error("Internal error: unable to serialize graph");
+    }
+}
+
+}
+
+}
+
+#endif
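
The new header probes the concrete graph type at runtime with a chain of dynamic_casts. The same dispatch pattern, reduced to a compilable toy where Graph, Vg, and HashGraph are stand-ins rather than the real classes:

```cpp
// Sketch: serialize through a base pointer by probing concrete types,
// mirroring the dynamic_cast chain in save_handle_graph().
#include <iostream>
#include <memory>
#include <stdexcept>

struct Graph { virtual ~Graph() = default; };
struct Vg : Graph {};         // stand-in for vg::VG
struct HashGraph : Graph {};  // stand-in for bdsg::HashGraph

void save(Graph* g, std::ostream& os) {
    if (dynamic_cast<Vg*>(g) != nullptr) {
        os << "saved as vg\n";
    } else if (dynamic_cast<HashGraph*>(g) != nullptr) {
        os << "saved as hash graph\n";
    } else {
        // unknown implementation: fail loudly, as the patch does
        throw std::runtime_error("unable to serialize graph");
    }
}

int main() {
    std::unique_ptr<Graph> g = std::make_unique<HashGraph>();
    save(g.get(), std::cout);
    return 0;
}
```
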
diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp
index a9cc8effe58..5bf9918b90c 100644
--- a/src/subcommand/augment_main.cpp
+++ b/src/subcommand/augment_main.cpp
@@ -26,6 +26,7 @@
 #include "../vg.hpp"
 #include "../augment.hpp"
 #include "../packer.hpp"
+#include "../io/save_handle_graph.hpp"
 #include
 #include
 #include
@@ -34,6 +35,7 @@
 #include "bdsg/odgi.hpp"
 #include
 
+
 using namespace std;
 using namespace vg;
 using namespace vg::subcommand;
@@ -388,19 +390,8 @@ int main_augment(int argc, char** argv) {
         }
     }
 
-    // Serialize the graph using VPKG. Todo: is there away to do this in one line?
-    // could just call serialie() directly if willing to forego vpkg...
-    if (vg_graph != nullptr) {
-        vg::io::VPKG::save(*vg_graph, cout);
-    } else if (dynamic_cast<bdsg::PackedGraph*>(graph.get()) != nullptr) {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph.get()), cout);
-    } else if (dynamic_cast<bdsg::HashGraph*>(graph.get()) != nullptr) {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph.get()), cout);
-    } else if (dynamic_cast<bdsg::ODGI*>(graph.get()) != nullptr) {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph.get()), cout);
-    } else {
-        throw runtime_error("Internal error: vg augment cannot output this graph format");
-    }
+    // Serialize the graph using VPKG.
+    vg::io::save_handle_graph(graph.get(), cout);
 
     return 0;
 }
diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp
index cd3fa4f476d..688b41a9a2f 100644
--- a/src/subcommand/convert_main.cpp
+++ b/src/subcommand/convert_main.cpp
@@ -3,6 +3,7 @@
 #include "../utility.hpp"
 #include "xg.hpp"
 #include "../convert_handle.hpp"
+#include "../io/save_handle_graph.hpp"
 
 #include
 #include
@@ -130,18 +131,8 @@ int main_convert(int argc, char** argv) {
         convert_handle_graph(input_graph.get(), mutable_output_graph);
     }
 
-    // Serialize the graph using VPKG. Todo: is there away to do this in one line?
-    if (output_format == "vg") {
-        vg::io::VPKG::save(*dynamic_cast<VG*>(output_graph.get()), cout);
-    } else if (output_format == "hash") {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(output_graph.get()), cout);
-    } else if (output_format == "packed") {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(output_graph.get()), cout);
-    } else if (output_format == "xg") {
-        vg::io::VPKG::save(*dynamic_cast<xg::XG*>(output_graph.get()), cout);
-    } else if (output_format == "odgi") {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(output_graph.get()), cout);
-    }
+    // Serialize the graph using VPKG.
+    vg::io::save_handle_graph(output_graph.get(), cout);
 
     return 0;
 }
diff --git a/src/subcommand/ids_main.cpp b/src/subcommand/ids_main.cpp
index 2b7c831f857..fa42a64670e 100644
--- a/src/subcommand/ids_main.cpp
+++ b/src/subcommand/ids_main.cpp
@@ -15,7 +15,14 @@
 #include "../vg.hpp"
 #include "../vg_set.hpp"
 #include "../algorithms/topological_sort.hpp"
-
+#include
+#include
+#include
+#include "bdsg/packed_graph.hpp"
+#include "bdsg/hash_graph.hpp"
+#include "bdsg/odgi.hpp"
+#include
+#include "../io/save_handle_graph.hpp"
 #include
 
 using namespace std;
@@ -110,19 +117,30 @@ int main_ids(int argc, char** argv) {
     }
 
     if (!join && mapping_name.empty()) {
-        VG* graph;
+        unique_ptr<MutablePathMutableHandleGraph> graph;
         get_input_file(optind, argc, argv, [&](istream& in) {
-            graph = new VG(in);
-        });
+            graph = vg::io::VPKG::load_one<MutablePathMutableHandleGraph>(in);
+        });
 
         if (sort) {
             // Set up the nodes so we go through them in topological order
-            graph->sort();
+            graph->apply_ordering(algorithms::topological_order(graph.get()), true);
         }
 
-        if (compact || sort) {
+        if (compact && !sort) {
             // Compact only, or compact to re-assign IDs after sort
-            graph->compact_ids();
+            VG* vg_graph = dynamic_cast<VG*>(graph.get());
+            if (vg_graph != nullptr) {
+                vg_graph->compact_ids();
+            } else {
+                // try to use the compact-option from apply_ordering
+                vector<handle_t> graph_ordering(graph->get_node_count());
+                size_t i = 0;
+                graph->for_each_handle([&](handle_t handle) {
+                    graph_ordering[i++] = handle;
+                });
+                graph->apply_ordering(graph_ordering, true);
+            }
         }
 
         if (increment != 0) {
@@ -130,11 +148,10 @@ int main_ids(int argc, char** argv) {
         }
 
         if (decrement != 0) {
-            graph->decrement_node_ids(decrement);
+            graph->increment_node_ids(-decrement);
         }
 
-        graph->serialize_to_ostream(std::cout);
-        delete graph;
+        vg::io::save_handle_graph(graph.get(), cout);
 
     } else {
 
         vector<string> graph_file_names;
diff --git a/src/vg_set.cpp b/src/vg_set.cpp
index f3e467a952a..7a60b06bd25 100644 --- a/src/vg_set.cpp +++ b/src/vg_set.cpp @@ -1,72 +1,62 @@ #include "vg_set.hpp" #include #include "source_sink_overlay.hpp" +#include +#include +#include "io/save_handle_graph.hpp" namespace vg { -// sets of VGs on disk +// sets of MutablePathMutableHandleGraphs on disk -void VGset::transform(std::function lambda) { +void VGset::transform(std::function lambda) { for (auto& name : filenames) { // load - VG* g = NULL; - if (name == "-") { - g = new VG(std::cin, show_progress & progress_bars); - } else { - ifstream in(name.c_str()); - if (!in) throw ifstream::failure("failed to open " + name); - g = new VG(in, show_progress & progress_bars); - in.close(); + unique_ptr g; + get_input_file(name, [&](istream& in) { + // Note: I would have liked to just load a MutableHandleGraph here but the resulting pointer + // is broken (tested: VG and PackedGraph) + g = vg::io::VPKG::load_one(in); + }); + // legacy: + VG* vg_g = dynamic_cast(g.get()); + if (vg_g != nullptr) { + vg_g->name = name; } - g->name = name; // apply - lambda(g); + lambda(g.get()); // write to the same file - ofstream out(name.c_str()); - g->serialize_to_ostream(out); - out.close(); - delete g; + vg::io::save_handle_graph(g.get(), name); } } -void VGset::for_each(std::function lambda) { +void VGset::for_each(std::function lambda) { for (auto& name : filenames) { // load - VG* g = NULL; - if (name == "-") { - g = new VG(std::cin, show_progress & progress_bars); - } else { - ifstream in(name.c_str()); - if (!in) throw ifstream::failure("failed to open " + name); - g = new VG(in, show_progress & progress_bars); - in.close(); - } - g->name = name; + unique_ptr g; + get_input_file(name, [&](istream& in) { + g = vg::io::VPKG::load_one(in); + }); + // legacy: + VG* vg_g = dynamic_cast(g.get()); + if (vg_g != nullptr) { + vg_g->name = name; + } // apply - lambda(g); - delete g; - } -} - -void VGset::for_each_graph_chunk(std::function lamda) { - for (auto& name : filenames) { - ifstream in(name.c_str()); - vg::io::for_each(in, lamda); + lambda(g.get()); } } id_t VGset::max_node_id(void) { id_t max_id = 0; - for_each_graph_chunk([&](const Graph& graph) { - for (size_t i = 0; i < graph.node_size(); ++i) { - max_id = max(graph.node(i).id(), max_id); - } + for_each([&](HandleGraph* graph) { + max_id = max(graph->max_node_id(), max_id); }); return max_id; } int64_t VGset::merge_id_space(void) { int64_t max_node_id = 0; - auto lambda = [&max_node_id](VG* g) { + auto lambda = [&max_node_id](MutableHandleGraph* g) { if (max_node_id > 0) g->increment_node_ids(max_node_id); max_node_id = g->max_node_id(); }; @@ -183,9 +173,13 @@ void VGset::to_xg(xg::XG& index, const function& paths_to_t } void VGset::for_each_kmer_parallel(size_t kmer_size, const function& lambda) { - for_each([&lambda, kmer_size, this](VG* g) { - g->show_progress = show_progress & progress_bars; - g->preload_progress("processing kmers of " + g->name); + for_each([&lambda, kmer_size, this](HandleGraph* g) { + // legacy + VG* vg_g = dynamic_cast(g); + if (vg_g != nullptr) { + vg_g->show_progress = show_progress & progress_bars; + vg_g->preload_progress("processing kmers of " + vg_g->name); + } //g->for_each_kmer_parallel(kmer_size, path_only, edge_max, lambda, stride, allow_dups, allow_negatives); for_each_kmer(*g, kmer_size, lambda); }); @@ -209,7 +203,7 @@ void VGset::write_gcsa_kmers_ascii(ostream& out, int kmer_size, cout << kp << endl; }; - for_each([&](VG* g) { + for_each([&](HandleGraph* g) { // Make an overlay for each graph, without 
modifying it. Break into tip-less cycle components. // Make sure to use a consistent head and tail ID across all graphs in the set. SourceSinkOverlay overlay(g, kmer_size, head_id, tail_id); @@ -234,7 +228,7 @@ void VGset::write_gcsa_kmers_binary(ostream& out, int kmer_size, size_t& size_li } size_t total_size = 0; - for_each([&](VG* g) { + for_each([&](HandleGraph* g) { // Make an overlay for each graph, without modifying it. Break into tip-less cycle components. // Make sure to use a consistent head and tail ID across all graphs in the set. SourceSinkOverlay overlay(g, kmer_size, head_id, tail_id); @@ -262,7 +256,7 @@ vector VGset::write_gcsa_kmers_binary(int kmer_size, size_t& size_limit, vector tmpnames; size_t total_size = 0; - for_each([&](VG* g) { + for_each([&](HandleGraph* g) { // Make an overlay for each graph, without modifying it. Break into tip-less cycle components. // Make sure to use a consistent head and tail ID across all graphs in the set. SourceSinkOverlay overlay(g, kmer_size, head_id, tail_id); diff --git a/src/vg_set.hpp b/src/vg_set.hpp index 8f8b4cce746..a3d069b9d20 100644 --- a/src/vg_set.hpp +++ b/src/vg_set.hpp @@ -6,7 +6,7 @@ #include #include #include -#include "vg.hpp" +#include "handle.hpp" #include "index.hpp" #include "xg.hpp" #include "kmer.hpp" @@ -14,7 +14,7 @@ namespace vg { -// for dealing with collections of VGs on disk +// for dealing with collections of HandleGraphs on disk class VGset { public: @@ -26,9 +26,8 @@ class VGset { : filenames(files) { }; - void transform(std::function lambda); - void for_each(std::function lambda); - void for_each_graph_chunk(std::function lamda); + void transform(std::function lambda); + void for_each(std::function lambda); /// Stream through the files and determine the max node id id_t max_node_id(void); From 88e355ab1f82a03e5c4105e4169c32922f67a819 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Nov 2019 12:09:07 -0500 Subject: [PATCH 61/79] only increment ids as-needed in vg ids -j --- src/vg_set.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vg_set.cpp b/src/vg_set.cpp index 7a60b06bd25..7ad99c668ff 100644 --- a/src/vg_set.cpp +++ b/src/vg_set.cpp @@ -57,7 +57,10 @@ id_t VGset::max_node_id(void) { int64_t VGset::merge_id_space(void) { int64_t max_node_id = 0; auto lambda = [&max_node_id](MutableHandleGraph* g) { - if (max_node_id > 0) g->increment_node_ids(max_node_id); + int64_t delta = max_node_id - g->min_node_id(); + if (delta >= 0) { + g->increment_node_ids(delta + 1); + } max_node_id = g->max_node_id(); }; transform(lambda); From d75981034d00051736bf16bd9bcb4e3fb21c4b19 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Nov 2019 09:17:27 -0500 Subject: [PATCH 62/79] more vg chunk handlification --- src/chunker.cpp | 116 ++++++++++++++++++---------------- src/chunker.hpp | 4 +- src/io/save_handle_graph.hpp | 21 +++++- src/subcommand/chunk_main.cpp | 49 +++++++++++--- test/t/30_vg_chunk.t | 5 +- 5 files changed, 127 insertions(+), 68 deletions(-) diff --git a/src/chunker.cpp b/src/chunker.cpp index 73066b34b41..6bea18f6087 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -3,6 +3,7 @@ #include #include "chunker.hpp" #include "algorithms/subgraph.hpp" +#include "convert_handle.hpp" //#define debug @@ -19,7 +20,16 @@ PathChunker::~PathChunker() { } void PathChunker::extract_subgraph(const Region& region, int context, int length, bool forward_only, - VG& subgraph, Region& out_region) { + MutablePathMutableHandleGraph& subgraph, Region& out_region) { + // This 
method still depends on VG + // (not a super high priority to port, as calling can now be done at genome scale and we no longer + // have to chunk up paths) + VG* vg_subgraph = dynamic_cast(&subgraph); + if (vg_subgraph == nullptr) { + vg_subgraph = new VG(); + assert(subgraph.get_node_count() == 0); + } + // extract our path range into the graph path_handle_t path_handle = graph->get_path_handle(region.seq); step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); @@ -42,28 +52,28 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length if (graph->get_is_reverse(step_handle)) { step_handle = graph->flip(step_handle); } - if (!subgraph.has_node(graph->get_id(step_handle))) { - subgraph.create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); + if (!vg_subgraph->has_node(graph->get_id(step_handle))) { + vg_subgraph->create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); } }; // expand the context and get path information // if forward_only true, then we only go forward. if (context > 0) { - algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, *vg_subgraph, context, forward_only); } if (length > 0) { - algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, *vg_subgraph, context, forward_only); } else if (context == 0 && length == 0) { - algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); + algorithms::add_connecting_edges_to_subgraph(*graph, *vg_subgraph); } - algorithms::add_subpaths_to_subgraph(*graph, subgraph); + algorithms::add_subpaths_to_subgraph(*graph, *vg_subgraph); // build the vg of the subgraph - subgraph.remove_orphan_edges(); + vg_subgraph->remove_orphan_edges(); // get our range endpoints before context expansion - list& mappings = subgraph.paths.get_path(region.seq); + list& mappings = vg_subgraph->paths.get_path(region.seq); assert(!mappings.empty()); size_t mappings_size = mappings.size(); int64_t input_start_node = graph->get_id(start_handle); @@ -126,13 +136,13 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length for (; prev_it != mappings.begin(); --prev_it) { cur_it = prev_it; --cur_it; - handle_t prev_handle = subgraph.get_handle(prev_it->node_id(), + handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), prev_it->is_reverse()); - handle_t cur_handle = subgraph.get_handle(cur_it->node_id(), + handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), cur_it->is_reverse()); - edge_t edge = subgraph.edge_handle(cur_handle, prev_handle); - if (!path_edge_set.count(make_pair(make_pair(subgraph.get_id(edge.first), subgraph.get_is_reverse(edge.first)), - make_pair(subgraph.get_id(edge.second), subgraph.get_is_reverse(edge.second))))) { + edge_t edge = vg_subgraph->edge_handle(cur_handle, prev_handle); + if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), + make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { #ifdef debug #pragma omp critical(cerr) { @@ -150,13 +160,13 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length cur_it = end_it; prev_it = cur_it; for (++cur_it; cur_it != mappings.end(); ++prev_it, ++cur_it) { - handle_t prev_handle = subgraph.get_handle(prev_it->node_id(), + handle_t prev_handle = 
vg_subgraph->get_handle(prev_it->node_id(), prev_it->is_reverse()); - handle_t cur_handle = subgraph.get_handle(cur_it->node_id(), + handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), cur_it->is_reverse()); - edge_t edge = subgraph.edge_handle(prev_handle, cur_handle); - if (!path_edge_set.count(make_pair(make_pair(subgraph.get_id(edge.first), subgraph.get_is_reverse(edge.first)), - make_pair(subgraph.get_id(edge.second), subgraph.get_is_reverse(edge.second))))) { + edge_t edge = vg_subgraph->edge_handle(prev_handle, cur_handle); + if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), + make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { #ifdef debug #pragma omp critical(cerr) { @@ -192,64 +202,70 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length // Cut our graph so that our reference path end points are graph tips. This will let the // snarl finder use the path to find telomeres. - path_handle_t sg_path_handle = subgraph.get_path_handle(region.seq); - Node* start_node = subgraph.get_node(mappings.begin()->node_id()); - auto sg_start_steps = path_steps_of_handle(subgraph, subgraph.get_handle(start_node->id()), sg_path_handle); + path_handle_t sg_path_handle = vg_subgraph->get_path_handle(region.seq); + Node* start_node = vg_subgraph->get_node(mappings.begin()->node_id()); + auto sg_start_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(start_node->id()), sg_path_handle); if (rewrite_paths && sg_start_steps.size() == 1) { - if (!mappings.begin()->is_reverse() && subgraph.start_degree(start_node) != 0) { - for (auto edge : subgraph.edges_to(start_node)) { + if (!mappings.begin()->is_reverse() && vg_subgraph->start_degree(start_node) != 0) { + for (auto edge : vg_subgraph->edges_to(start_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } - } else if (mappings.begin()->is_reverse() && subgraph.end_degree(start_node) != 0) { - for (auto edge : subgraph.edges_from(start_node)) { + } else if (mappings.begin()->is_reverse() && vg_subgraph->end_degree(start_node) != 0) { + for (auto edge : vg_subgraph->edges_from(start_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } } } - Node* end_node = subgraph.get_node(mappings.rbegin()->node_id()); - auto sg_end_steps = path_steps_of_handle(subgraph, subgraph.get_handle(end_node->id()), sg_path_handle); + Node* end_node = vg_subgraph->get_node(mappings.rbegin()->node_id()); + auto sg_end_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(end_node->id()), sg_path_handle); if (rewrite_paths && sg_end_steps.size() == 1) { - if (!mappings.rbegin()->is_reverse() && subgraph.end_degree(end_node) != 0) { - for (auto edge : subgraph.edges_from(end_node)) { + if (!mappings.rbegin()->is_reverse() && vg_subgraph->end_degree(end_node) != 0) { + for (auto edge : vg_subgraph->edges_from(end_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } - } else if 
(mappings.rbegin()->is_reverse() && subgraph.start_degree(end_node) != 0) { - for (auto edge : subgraph.edges_to(end_node)) { + } else if (mappings.rbegin()->is_reverse() && vg_subgraph->start_degree(end_node) != 0) { + for (auto edge : vg_subgraph->edges_to(end_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } } } // Sync our updated paths lists back into the Graph protobuf if (rewrite_paths) { - subgraph.paths.rebuild_node_mapping(); - subgraph.paths.rebuild_mapping_aux(); - subgraph.graph.clear_path(); - subgraph.paths.to_graph(subgraph.graph); + vg_subgraph->paths.rebuild_node_mapping(); + vg_subgraph->paths.rebuild_mapping_aux(); + vg_subgraph->graph.clear_path(); + vg_subgraph->paths.to_graph(vg_subgraph->graph); + } + + // copy back out of vg if necessary + if (dynamic_cast(&subgraph) == nullptr) { + convert_path_handle_graph(vg_subgraph, &subgraph); + delete vg_subgraph; } // start could fall inside a node. we find out where in the path the @@ -262,32 +278,22 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length } void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, int length, - bool forward_only, VG& subgraph, + bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region) { - Graph g; - for (vg::id_t i = start; i <= end; ++i) { - Node node; - node.set_id(i); - node.set_sequence(graph->get_sequence(graph->get_handle(i))); - *g.add_node() = node; + subgraph.create_handle(graph->get_sequence(graph->get_handle(i)), i); } - VG vg_g(g); - // expand the context and get path information // if forward_only true, then we only go forward. - algorithms::expand_subgraph_by_steps(*graph, vg_g, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); if (length) { - algorithms::expand_subgraph_by_length(*graph, vg_g, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } - algorithms::add_subpaths_to_subgraph(*graph, vg_g); + algorithms::add_subpaths_to_subgraph(*graph, subgraph); // build the vg - subgraph.extend(vg_g); - subgraph.remove_orphan_edges(); - out_region.start = subgraph.min_node_id(); out_region.end = subgraph.max_node_id(); } diff --git a/src/chunker.hpp b/src/chunker.hpp index 169d083e16b..42829542ee8 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -40,13 +40,13 @@ class PathChunker { * inclusive. * */ void extract_subgraph(const Region& region, int context, int length, bool forward_only, - VG& subgraph, Region& out_region); + MutablePathMutableHandleGraph& subgraph, Region& out_region); /** * Like above, but use (inclusive) id range instead of region on path. 
     */
     void extract_id_range(vg::id_t start, vg::id_t end, int context, int length, bool forward_only,
-                          VG& subgraph, Region& out_region);
+                          MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
     /**
      * Get a set of all edges in the graph along a path region (to check for discontinuities later on)
diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp
index 9085ac5d3c1..ed9c1de3b9d 100644
--- a/src/io/save_handle_graph.hpp
+++ b/src/io/save_handle_graph.hpp
@@ -56,7 +56,26 @@ inline void save_handle_graph(HandleGraph* graph, const string& dest_path) {
         throw runtime_error("Internal error: unable to serialize graph");
     }
 }
-
+
+// Check that output format specifier is a valid graph type
+inline bool valid_output_format(const string& fmt_string) {
+    return fmt_string == "vg" || fmt_string == "pg" || fmt_string == "hg";
+}
+
+// Create a new graph (of handle graph type T) where the implementation is chosen using the format string
+template<class T>
+T* new_output_graph(const string& fmt_string) {
+    if (fmt_string == "vg") {
+        return new VG();
+    } else if (fmt_string == "pg") {
+        return new bdsg::PackedGraph();
+    } else if (fmt_string == "hg") {
+        return new bdsg::HashGraph();
+    } else {
+        return nullptr;
+    }
+}
 
 }
 }
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp
index 225b9a20491..856d5e28bac 100644
--- a/src/subcommand/chunk_main.cpp
+++ b/src/subcommand/chunk_main.cpp
@@ -22,6 +22,8 @@
 #include "../haplotype_extracter.hpp"
 #include "../algorithms/sorted_id_ranges.hpp"
 #include
+#include "../io/save_handle_graph.hpp"
+#include "convert_handle.hpp"
 
 using namespace std;
 using namespace vg;
@@ -68,6 +70,7 @@ void help_chunk(char** argv) {
          << "    -T, --trace              trace haplotype threads in chunks (and only expand forward from input coordinates)." << endl
          << "                             Produces a .annotate.txt file with haplotype frequencies for each chunk." << endl
          << "    -f, --fully-contained    only return GAM alignments that are fully contained within chunk" << endl
+         << "    -O, --output-fmt         Specify output format (vg, pg, hg). [VG]" << endl
          << "    -t, --threads N          for tasks that can be done in parallel, use this many threads [1]" << endl
          << "    -h, --help" << endl;
 }
@@ -100,6 +103,7 @@ int main_chunk(int argc, char** argv) {
     bool fully_contained = false;
     int n_chunks = 0;
     size_t gam_split_size = 0;
+    string output_format = "vg";
 
     int c;
     optind = 2; // force optind past command positional argument
@@ -127,11 +131,12 @@ int main_chunk(int argc, char** argv) {
         {"n-chunks", required_argument, 0, 'n'},
         {"context-length", required_argument, 0, 'l'},
         {"gam-split-size", required_argument, 0, 'm'},
+        {"output-fmt", required_argument, 0, 'O'},
         {0, 0, 0, 0}
     };
 
     int option_index = 0;
-    c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:",
+    c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:O:",
                      long_options, &option_index);
 
@@ -226,6 +231,10 @@ int main_chunk(int argc, char** argv) {
             threads = parse<int>(optarg);
             break;
 
+        case 'O':
+            output_format = optarg;
+            break;
+
         case 'h':
         case '?':
             help_chunk(argv);
@@ -263,6 +272,13 @@ int main_chunk(int argc, char** argv) {
         }
     }
 
+    // check the output format
+    std::transform(output_format.begin(), output_format.end(), output_format.begin(), ::tolower);
+    if (!vg::io::valid_output_format(output_format)) {
+        cerr << "error[vg chunk]: invalid output format" << endl;
+        return 1;
+    }
+
     // figure out which outputs we want.  the graph always
the graph always
     // needs to be chunked, even if only gam output is requested,
     // because we use the graph to get the nodes we're looking for.
@@ -518,15 +534,16 @@ int main_chunk(int argc, char** argv) {
         int tid = omp_get_thread_num();
         Region& region = regions[i];
         PathChunker& chunker = chunkers[tid];
-        VG* subgraph = NULL;
+        MutablePathMutableHandleGraph* subgraph = NULL;
         map<string, int> trace_thread_frequencies;
         if (id_range == false) {
-            subgraph = new VG();
+            subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
             chunker.extract_subgraph(region, context_steps, context_length,
                                      trace, *subgraph, output_regions[i]);
+
         } else {
             if (chunk_graph || context_steps > 0) {
-                subgraph = new VG();
+                subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
                 output_regions[i].seq = region.seq;
                 chunker.extract_id_range(region.start, region.end,
                                          context_steps, context_length, trace,
@@ -556,9 +573,23 @@ int main_chunk(int argc, char** argv) {
             Graph g;
             trace_haplotypes_and_paths(*graph, *gbwt_index.get(), trace_start, trace_steps,
                                        g, trace_thread_frequencies, false);
-            subgraph->paths.for_each([&trace_thread_frequencies](const Path& path) {
-                    trace_thread_frequencies[path.name()] = 1;});
-            subgraph->extend(g);
+            subgraph->for_each_path_handle([&trace_thread_frequencies, &subgraph](path_handle_t path_handle) {
+                    trace_thread_frequencies[subgraph->get_path_name(path_handle)] = 1;});
+            VG* vg_subgraph = dynamic_cast<VG*>(subgraph);
+            if (vg_subgraph != nullptr) {
+                // our graph is in vg format, just extend it
+                vg_subgraph->extend(g);
+            } else {
+                // our graph is not in vg format. convert it, extend it, convert it back
+                // this can eventually be avoided by handlifying the haplotype tracer
+                vg_subgraph = new VG();
+                convert_path_handle_graph(subgraph, vg_subgraph);
+                delete subgraph;
+                vg_subgraph->extend(g);
+                subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
+                convert_path_handle_graph(vg_subgraph, subgraph);
+                delete vg_subgraph;
+            }
         }
 
         ofstream out_file;
@@ -580,8 +611,8 @@ int main_chunk(int argc, char** argv) {
                 }
                 out_stream = &out_file;
             }
-
-            subgraph->serialize_to_ostream(*out_stream);
+
+            vg::io::save_handle_graph(subgraph, *out_stream);
         }
 
         // optional gam chunking
diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t
index 6a59711bc9a..514313b22a6 100644
--- a/test/t/30_vg_chunk.t
+++ b/test/t/30_vg_chunk.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 16
+plan tests 17
 
 # Construct a graph with alt paths so we can make a gPBWT and later a GBWT
 vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg
@@ -19,6 +19,9 @@ is $(vg chunk -x x.xg -p x -c 10| vg stats - -E) 291 "vg chunk with no options p
 # check a small chunk
 is $(vg chunk -x x.xg -p x:20-30 -c 0 | vg view - -j | jq -c '.path[0].mapping[].position' | jq 'select ((.node_id == "9"))' | grep node | sed s/,// | sort | uniq | wc -l) 1 "chunk has path going through node 9"
 
+# check a small chunk, but using vg input and packed graph output
+is $(vg chunk -x x.vg -p x:20-30 -c 0 -O pg | vg convert -v - | vg view - -j | jq -c '.path[0].mapping[].position' | jq 'select ((.node_id == "9"))' | grep node | sed s/,// | sort | uniq | wc -l) 1 "chunk has path going through node 9"
+
 # check no crash when using chunk_size, and filenames deterministic
 rm -f _chunk_test*
 vg chunk -x x.xg -p x -s 233 -o 50 -b _chunk_test -c 0 -t 2

From 245690d254c82755c1f2d6b7262287e0905c61ff Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Fri, 22 Nov 2019 13:32:33 -0500
Subject: [PATCH 63/79] add components chunking to chunk and deprecate explode
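
(Reader's sketch, not part of the committed change: the new -C/-M modes amount to an
undirected flood fill over node ids, after which each id set is pulled out with
PathChunker::extract_component. Below is a minimal, self-contained illustration of that
flood fill against the libhandlegraph API; the function name components_sketch is
illustrative only, vg's real implementation being algorithms::weakly_connected_components.

    #include <handlegraph/handle_graph.hpp>
    #include <deque>
    #include <unordered_set>
    #include <vector>

    using namespace handlegraph;

    // Collect weakly connected components as sets of node ids by flooding
    // outward from every not-yet-seen node, ignoring edge direction.
    std::vector<std::unordered_set<nid_t>> components_sketch(const HandleGraph& graph) {
        std::vector<std::unordered_set<nid_t>> components;
        std::unordered_set<nid_t> seen;
        graph.for_each_handle([&](const handle_t& start) {
            if (seen.count(graph.get_id(start))) {
                return;
            }
            components.emplace_back();
            std::deque<handle_t> queue = {start};
            while (!queue.empty()) {
                handle_t h = queue.front();
                queue.pop_front();
                if (!seen.insert(graph.get_id(h)).second) {
                    continue; // already flooded through this node
                }
                components.back().insert(graph.get_id(h));
                // Enqueue neighbors off both sides of the node.
                for (bool go_left : {false, true}) {
                    graph.follow_edges(h, go_left, [&](const handle_t& next) {
                        queue.push_back(next);
                    });
                }
            }
        });
        return components;
    }

Seeding extract_component with one such id set and then expanding by
numeric_limits<uint64_t>::max() steps reuses the existing context-expansion machinery to
recover the component's edges and paths.)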
---
 src/algorithms/subgraph.cpp     |   2 +-
 src/chunker.cpp                 |  28 +++++++-
 src/chunker.hpp                 |  18 +++--
 src/subcommand/chunk_main.cpp   | 119 +++++++++++++++++++++++---------
 src/subcommand/explode_main.cpp |   4 ++
 test/t/30_vg_chunk.t            |  30 +++++++-
 6 files changed, 159 insertions(+), 42 deletions(-)

diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp
index cc3e388e7b1..c7a11ae00ca 100644
--- a/src/algorithms/subgraph.cpp
+++ b/src/algorithms/subgraph.cpp
@@ -8,7 +8,7 @@ void expand_subgraph_by_steps(const HandleGraph& source, MutableHandleGraph& sub
     subgraph.for_each_handle([&](const handle_t& h) {
             curr_handles.push_back(h);
         });
-    for (uint64_t i = 0; i < steps; ++i) {
+    for (uint64_t i = 0; i < steps && !curr_handles.empty(); ++i) {
         std::vector<handle_t> next_handles;
         for (auto& h : curr_handles) {
             handle_t old_h = source.get_handle(subgraph.get_id(h));
diff --git a/src/chunker.cpp b/src/chunker.cpp
index 6bea18f6087..70990596f21 100644
--- a/src/chunker.cpp
+++ b/src/chunker.cpp
@@ -19,7 +19,7 @@ PathChunker::~PathChunker() {
 
 }
 
-void PathChunker::extract_subgraph(const Region& region, int context, int length, bool forward_only,
+void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only,
                                    MutablePathMutableHandleGraph& subgraph, Region& out_region) {
     // This method still depends on VG
     // (not a super high priority to port, as calling can now be done at genome scale and we no longer
@@ -277,7 +277,29 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length
     out_region.end = input_end_pos + graph->get_length(end_handle) + right_padding - 1;
 }
 
-void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, int length,
+void PathChunker::extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region) {
+    unordered_set<nid_t> path_ids;
+
+    path_handle_t path_handle = graph->get_path_handle(path_name);
+    for (handle_t handle : graph->scan_path(path_handle)) {
+        path_ids.insert(graph->get_id(handle));
+    }
+
+    extract_component(path_ids, subgraph);
+    out_region.seq = path_name;
+}
+
+void PathChunker::extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph) {
+
+    for (nid_t node_id : node_ids) {
+        subgraph.create_handle(graph->get_sequence(graph->get_handle(node_id)), node_id);
+    }
+
+    algorithms::expand_subgraph_by_steps(*graph, subgraph, numeric_limits<uint64_t>::max());
+    algorithms::add_subpaths_to_subgraph(*graph, subgraph);
+}
+
+void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length,
                                    bool forward_only, MutablePathMutableHandleGraph& subgraph,
                                    Region& out_region) {
 
@@ -299,7 +321,7 @@ void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, in
 }
 
 set<pair<pair<nid_t, bool>, pair<nid_t, bool>>> PathChunker::get_path_edge_index(step_handle_t start_step,
-                                                                                 step_handle_t end_step, int context) const {
+                                                                                 step_handle_t end_step, int64_t context) const {
     // we don't use handles as we're going to use this structure to compare edges across different graphs
     set<pair<pair<nid_t, bool>, pair<nid_t, bool>>> path_edges;
diff --git a/src/chunker.hpp b/src/chunker.hpp
index 42829542ee8..02cc895a726 100644
--- a/src/chunker.hpp
+++ b/src/chunker.hpp
@@ -39,13 +39,30 @@ class PathChunker {
      * NOTE: we follow convention of Region coordinates being 0-based
      * inclusive.
     * */
-    void extract_subgraph(const Region& region, int context, int length, bool forward_only,
+    void extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only,
                           MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
+    /**
+     * Extract a connected component containing a given path
+     */
+    void extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region);
+
+    /**
+     * Extract a connected component starting from an id set
+     */
+    void extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph);
+
     /**
      * Like above, but use (inclusive) id range instead of region on path.
      */
-    void extract_id_range(vg::id_t start, vg::id_t end, int context, int length, bool forward_only,
-                          MutablePathMutableHandleGraph& subgraph, Region& out_region);
+    void extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, bool forward_only,
+                          MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
     /**
      * Get a set of all edges in the graph along a path region (to check for discontinuities later on)
     */
     set<pair<pair<nid_t, bool>, pair<nid_t, bool>>> get_path_edge_index(step_handle_t start_step,
-                                                                        step_handle_t end_step, int context) const;
+                                                                        step_handle_t end_step, int64_t context) const;
 
 };
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp
index 856d5e28bac..eac1188e84f 100644
--- a/src/subcommand/chunk_main.cpp
+++ b/src/subcommand/chunk_main.cpp
@@ -21,6 +21,7 @@
 #include "../region.hpp"
 #include "../haplotype_extracter.hpp"
 #include "../algorithms/sorted_id_ranges.hpp"
+#include "../algorithms/weakly_connected_components.hpp"
 #include
 #include "../io/save_handle_graph.hpp"
 #include "convert_handle.hpp"
 
 using namespace std;
 using namespace vg;
 using namespace vg::subcommand;
 
-static string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi = 0);
+static string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi = 0, bool components = false);
 
 static int split_gam(istream& gam_stream, size_t chunk_size, const string& out_prefix, size_t gam_buffer_size = 100);
 
@@ -43,12 +44,12 @@ void help_chunk(char** argv) {
 << "For a single-range chunk (-p or -r), the graph data is sent to standard output instead of a file." << endl
 << endl
 << "options:" << endl
- << " -x, --xg-name FILE use this xg index to chunk subgraphs" << endl
+ << " -x, --xg-name FILE use this graph or xg index to chunk subgraphs" << endl
 << " -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction" << endl
 << " -a, --gam-name FILE chunk this gam file (not stdin, sorted, with FILE.gai index) instead of the graph (multiple allowed)" << endl
 << " -g, --gam-and-graph when used in combination with -a, both gam and graph will be chunked" << endl
 << "path chunking:" << endl
- << " -p, --path TARGET write the chunk in the specified (0-based inclusive)\n"
+ << " -p, --path TARGET write the chunk in the specified (0-based inclusive, multiple allowed)\n"
 << " path range TARGET=path[:pos1[-pos2]] to standard output" << endl
 << " -P, --path-list FILE write chunks for all path regions in (line - separated file). format" << endl
format" << endl << " for each as in -p (all paths chunked unless otherwise specified)" << endl @@ -59,6 +60,9 @@ void help_chunk(char** argv) { << " -n, --n-chunks N generate this many id-range chunks, which are determined using the xg index" << endl << "simple gam chunking:" << endl << " -m, --gam-split-size N split gam (specified with -a, sort/index not required) up into chunks with at most N reads each" << endl + << "component chunking:" << endl + << " -C, --components create a chunk for each connected component. if a targets given with (-p, -P, -r, -R), limit to components containing them" << endl + << " -M, --path-components create a chunk for each path in the graph's connected component" << endl << "general:" << endl << " -s, --chunk-size N create chunks spanning N bases (or nodes with -r/-R) for all input regions." << endl << " -o, --overlap N overlap between chunks when using -s [0]" << endl @@ -86,7 +90,7 @@ int main_chunk(int argc, char** argv) { string gbwt_file; vector gam_files; bool gam_and_graph = false; - string region_string; + vector region_strings; string path_list_file; int chunk_size = 0; int overlap = 0; @@ -104,6 +108,8 @@ int main_chunk(int argc, char** argv) { int n_chunks = 0; size_t gam_split_size = 0; string output_format = "vg"; + bool components = false; + bool path_components = false; int c; optind = 2; // force optind past command positional argument @@ -131,12 +137,14 @@ int main_chunk(int argc, char** argv) { {"n-chunks", required_argument, 0, 'n'}, {"context-length", required_argument, 0, 'l'}, {"gam-split-size", required_argument, 0, 'm'}, + {"components", no_argument, 0, 'C'}, + {"path-components", no_argument, 0, 'M'}, {"output-fmt", required_argument, 0, 'O'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:O:", + c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:CMO:", long_options, &option_index); @@ -164,7 +172,7 @@ int main_chunk(int argc, char** argv) { break; case 'p': - region_string = optarg; + region_strings.push_back(optarg); break; case 'P': @@ -219,6 +227,15 @@ int main_chunk(int argc, char** argv) { gam_split_size = parse(optarg); break; + case 'C': + components = true; + break; + + case 'M': + components = true; + path_components = true; + break; + case 'T': trace = true; break; @@ -249,9 +266,10 @@ int main_chunk(int argc, char** argv) { omp_set_num_threads(threads); // need at most one of -n, -p, -P, -e, -r, -R, -m as an input - if ((n_chunks == 0 ? 0 : 1) + (region_string.empty() ? 0 : 1) + (path_list_file.empty() ? 0 : 1) + (in_bed_file.empty() ? 0 : 1) + - (node_ranges_file.empty() ? 0 : 1) + (node_range_string.empty() ? 0 : 1) + (gam_split_size == 0 ? 0 : 1) > 1) { - cerr << "error:[vg chunk] at most one of {-n, -p, -P, -e, -r, -R, m} required to specify input regions" << endl; + if ((n_chunks == 0 ? 0 : 1) + (region_strings.empty() ? 0 : 1) + (path_list_file.empty() ? 0 : 1) + (in_bed_file.empty() ? 0 : 1) + + (node_ranges_file.empty() ? 0 : 1) + (node_range_string.empty() ? 0 : 1) + (gam_split_size == 0 ? 0 : 1) + + (path_components ? 
1 : 0) > 1) {
+        cerr << "error:[vg chunk] at most one of {-n, -p, -P, -e, -r, -R, -m, -M} required to specify input regions" << endl;
         return 1;
     }
     // need -a if using -f
     if ((fully_contained || gam_split_size > 0) && gam_files.empty()) {
         cerr << "error:[vg chunk] gam file must be specified with -a when using -f or -m" << endl;
         return 1;
     }
+    if (components == true && context_steps >= 0) {
+        cerr << "error:[vg chunk] context cannot be specified (-c) when splitting into components (-C)" << endl;
+        return 1;
+    }
     // context steps default to 1 if using id_ranges. otherwise, force user to specify to avoid
     // misunderstandings
     if (context_steps < 0 && gam_split_size == 0) {
         if (id_range) {
             if (!context_length) {
                 context_steps = 1;
             }
-        } else {
+        } else if (!components) {
             cerr << "error:[vg chunk] context expansion steps must be specified with -c/--context when chunking on paths" << endl;
             return 1;
         }
@@ -291,15 +313,15 @@ int main_chunk(int argc, char** argv) {
     unique_ptr<PathHandleGraph> path_handle_graph;
     bdsg::PathPositionOverlayHelper overlay_helper;
-    if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0)) {
+    if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0) || components) {
         if (xg_file.empty()) {
-            cerr << "error:[vg chunk] xg index (-x) required" << endl;
+            cerr << "error:[vg chunk] graph or xg index (-x) required" << endl;
             return 1;
         }
 
         ifstream in(xg_file.c_str());
         if (!in) {
-            cerr << "error:[vg chunk] unable to load xg index file " << xg_file << endl;
+            cerr << "error:[vg chunk] unable to load graph / xg index file " << xg_file << endl;
             return 1;
         }
 
@@ -337,10 +359,12 @@ int main_chunk(int argc, char** argv) {
     // parse the regions into a list
     vector<Region> regions;
-    if (!region_string.empty()) {
-        Region region;
-        parse_region(region_string, region);
-        regions.push_back(region);
+    if (!region_strings.empty()) {
+        for (auto& region_string : region_strings) {
+            Region region;
+            parse_region(region_string, region);
+            regions.push_back(region);
+        }
     }
     else if (!path_list_file.empty()) {
         ifstream pr_stream(path_list_file.c_str());
@@ -417,7 +441,7 @@ int main_chunk(int argc, char** argv) {
             delete range_stream;
         }
     }
-    else if (graph != nullptr) {
+    else if (graph != nullptr && (!components || path_components)) {
         // every path
         graph->for_each_path_handle([&](path_handle_t path_handle) {
                 Region region;
@@ -443,7 +467,7 @@ int main_chunk(int argc, char** argv) {
             region.start = max((int64_t)0, region.start);
             if (region.end == -1) {
                 region.end = get_path_length(region.seq) - 1;
-            } else if (!id_range) {
+            } else if (!id_range && !components) {
                 if (region.start < 0 || region.end >= get_path_length(region.seq)) {
                     cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end
                          << " is out of bounds of path " << region.seq << " which has length "<< get_path_length(region.seq)
@@ -472,6 +496,20 @@ int main_chunk(int argc, char** argv) {
         swap(regions, chunked_regions);
     }
 
+    // when using -C for components, regions will be derived from the connected components
+    vector<unordered_set<nid_t>> component_ids;
+    if (components == true && regions.empty()) {
+        // no regions given, we find our components from scratch and make some dummy regions
+        component_ids = algorithms::weakly_connected_components(graph);
+        for (int i = 0; i < component_ids.size(); ++i) {
+            Region region;
+            region.seq = "";
+            region.start = 0;
+            region.end = 0;
+            regions.push_back(region);
+        }
+    }
+
    // now ready to get our chunk on
    if (gam_split_size != 0) {
        for (size_t gi = 0; gi < gam_files.size(); ++gi) {
@@ -536,17 +574,26 @@ int main_chunk(int argc, char** argv) {
        int tid = omp_get_thread_num();
        Region& region = regions[i];
        PathChunker& chunker = chunkers[tid];
        MutablePathMutableHandleGraph* subgraph = NULL;
        map<string, int> trace_thread_frequencies;
-        if (id_range == false) {
+        if (!component_ids.empty()) {
            subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
-            chunker.extract_subgraph(region, context_steps, context_length,
-                                     trace, *subgraph, output_regions[i]);
-
+            chunker.extract_component(component_ids[i], *subgraph);
+            output_regions[i] = region;
+        }
+        else if (id_range == false) {
+            subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
+            if (components == true) {
+                chunker.extract_path_component(region.seq, *subgraph, output_regions[i]);
+            } else {
+                chunker.extract_subgraph(region, context_steps, context_length,
+                                         trace, *subgraph, output_regions[i]);
+            }
        } else {
            if (chunk_graph || context_steps > 0) {
                subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
                output_regions[i].seq = region.seq;
                chunker.extract_id_range(region.start, region.end,
-                                         context_steps, context_length, trace,
+                                         components ? numeric_limits<int64_t>::max() : context_steps,
+                                         context_length, trace && !components,
                                         *subgraph, output_regions[i]);
            } else {
                // in this case, there's no need to actually build the subgraph, so we don't
@@ -595,7 +642,7 @@ int main_chunk(int argc, char** argv) {
        ofstream out_file;
        ostream* out_stream = NULL;
        if (chunk_graph) {
-            if ((!region_string.empty() || !node_range_string.empty()) &&
+            if ((!region_strings.empty() || !node_range_string.empty()) &&
                (regions.size() == 1) && chunk_size == 0) {
                // If we are going to output only one chunk, it should go to
                // stdout instead of to a file on disk
            } else {
                // Otherwise, we write files under the specified prefix, using
                // a prefix-i-seq-start-end convention.
-                string name = chunk_name(out_chunk_prefix, i, output_regions[i], ".vg");
+                string name = chunk_name(out_chunk_prefix, i, output_regions[i], "." + output_format, 0, components);
                out_file.open(name);
                if (!out_file) {
                    cerr << "error[vg chunk]: can't open output chunk file " << name << endl;
@@ -622,7 +669,7 @@ int main_chunk(int argc, char** argv) {
            assert(gam_index.get() != nullptr);
            GAMIndex::cursor_t& cursor = cursors_vec[gi][tid];
 
-            string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi);
+            string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi, components);
            ofstream out_gam_file(gam_name);
            if (!out_gam_file) {
                cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl;
@@ -647,7 +694,7 @@ int main_chunk(int argc, char** argv) {
        if (trace) {
            // Even if we have only one chunk, the trace annotation data always
            // ends up in a file.
-            string annot_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".annotate.txt");
+            string annot_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".annotate.txt", 0, components);
            ofstream out_annot_file(annot_name);
            if (!out_annot_file) {
                cerr << "error[vg chunk]: can't open output trace annotation file " << annot_name << endl;
@@ -671,9 +718,9 @@ int main_chunk(int argc, char** argv) {
            const Region& oregion = output_regions[i];
            string seq = id_range ? "ids" : oregion.seq;
            obed << seq << "\t" << oregion.start << "\t" << (oregion.end + 1)
-                 << "\t" << chunk_name(out_chunk_prefix, i, oregion, chunk_gam ? ".gam" : ".vg");
+                 << "\t" << chunk_name(out_chunk_prefix, i, oregion, chunk_gam ? ".gam" : "."
+ output_format, 0, components); if (trace) { - obed << "\t" << chunk_name(out_chunk_prefix, i, oregion, ".annotate.txt"); + obed << "\t" << chunk_name(out_chunk_prefix, i, oregion, ".annotate.txt", 0, components); } obed << "\n"; } @@ -686,15 +733,21 @@ int main_chunk(int argc, char** argv) { static Subcommand vg_chunk("chunk", "split graph or alignment into chunks", main_chunk); // Output name of a chunk -string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi) { +string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi, bool components) { stringstream chunk_name; string seq = region.seq.empty() ? "ids" : region.seq; chunk_name << out_chunk_prefix; if (gi > 0) { chunk_name << "-" << gi; } - chunk_name << "_" << i << "_" << seq << "_" - << region.start << "_" << region.end << ext; + if (!components) { + chunk_name << "_" << i << "_" << seq << "_" << region.start << "_" << region.end; + } else if (region.seq.empty()) { + chunk_name << "_" << i; + } else { + chunk_name << "_" << region.seq; + } + chunk_name << ext; return chunk_name.str(); } diff --git a/src/subcommand/explode_main.cpp b/src/subcommand/explode_main.cpp index b0ef45ee86a..aca755f0e5c 100644 --- a/src/subcommand/explode_main.cpp +++ b/src/subcommand/explode_main.cpp @@ -71,6 +71,10 @@ int main_explode(int argc, char** argv) { } } + cerr << "vg explode is deprecated. Please use \"vg chunk -C source.vg -b part_dir/component\" for same* functionality as \"vg explode source.vg part_dir\"" << endl + << " * (unlike explode, the output directory must already exist when running chunk, though)" << endl; + return 1; + VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 514313b22a6..4a2e6a3b5c4 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 17 +plan tests 20 # Construct a graph with alt paths so we can make a gPBWT and later a GBWT vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -61,3 +61,31 @@ is $(cat x.chunk/*vg | vg view -V - | grep -v P 2>/dev/null | sort | md5sum | c rm -rf x.sorted.gam x.sorted.gam.gai _chunk_test_bed.bed _chunk_test* x.chunk rm -f x.vg x.xg x.gbwt x.gam.json filter_chunk*.gam chunks.bed rm -f chunk_*.annotate.txt + +vg construct -r small/xy.fa -v small/xy.vcf.gz > xy.vg +vg construct -r small/xy.fa -v small/xy.vcf.gz -R x > x.vg +vg construct -r small/xy.fa -v small/xy.vcf.gz -R y > y.vg +# test that exploding into components works +vg chunk -x xy.vg -M -b path_chunk -O hg +vg view x.vg | grep "^S" | awk '{print $3}' | sort > x_nodes.txt +vg view y.vg | grep "^S" | awk '{print $3}' | sort > y_nodes.txt +vg convert path_chunk_x.hg -v | vg view - | grep "^S" | awk '{print $3}' | sort > pc_x_nodes.txt +vg convert path_chunk_y.hg -v | vg view - | grep "^S" | awk '{print $3}' | sort > pc_y_nodes.txt +diff x_nodes.txt pc_x_nodes.txt && diff y_nodes.txt pc_y_nodes.txt +is "$?" 0 "path-based components finds subgraphs" +vg paths -v x.vg -E > x_paths.txt +vg paths -v path_chunk_x.hg -E > pc_x_paths.txt +diff pc_x_paths.txt x_paths.txt +is "$?" 
0 "path-based component contains correct path length" +vg chunk -x xy.vg -C -b components_chunk +vg view components_chunk_0.vg | grep "^S" | awk '{print $3}' > comp_0_nodes.txt +vg view components_chunk_1.vg | grep "^S" | awk '{print $3}' > comp_1_nodes.txt +cat comp_0_nodes.txt comp_1_nodes.txt | sort > comp_nodes.txt +cat x_nodes.txt y_nodes.txt | sort > nodes.txt +diff comp_nodes.txt nodes.txt +is "$?" 0 "components finds subgraphs" + +rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.hg convert path_chunk_y.hg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt + + + From b27cc3c902bd58345247137ad32b0dfcf7ed1878 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Nov 2019 13:37:53 -0500 Subject: [PATCH 64/79] skip alt paths --- src/subcommand/chunk_main.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index eac1188e84f..f6e01118a2d 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -446,7 +446,9 @@ int main_chunk(int argc, char** argv) { graph->for_each_path_handle([&](path_handle_t path_handle) { Region region; region.seq = graph->get_path_name(path_handle); - regions.push_back(region); + if (!Paths::is_alt(region.seq)) { + regions.push_back(region); + } }); } From 5e6c7fb9973ac8b976d351acada89ca81d3e7dfd Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Nov 2019 10:42:29 -0500 Subject: [PATCH 65/79] force dynamic schedule in parallel snarl iteration (for vg call) --- src/snarls.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snarls.cpp b/src/snarls.cpp index add633bb706..9c2e875fd5e 100644 --- a/src/snarls.cpp +++ b/src/snarls.cpp @@ -704,7 +704,7 @@ const vector& SnarlManager::top_level_snarls() const { } void SnarlManager::for_each_top_level_snarl_parallel(const function& lambda) const { -#pragma omp parallel for +#pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < roots.size(); i++) { lambda(roots[i]); } From f2bfe040dda73bd58c351ddc1413304f8e9fe9a2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Nov 2019 10:44:51 -0500 Subject: [PATCH 66/79] more careful about cycle scan --- src/graph_caller.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index f96de89b2fc..48c5047cf5c 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -40,7 +40,8 @@ void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); } -#pragma omp parallel for + +#pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < cur_queue.size(); ++i) { process_snarl(cur_queue[i]); } @@ -364,6 +365,7 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { function get_path_index; VG vg_graph; SupportBasedSnarlCaller& support_caller = dynamic_cast(snarl_caller); + bool was_called = false; if (is_vg) { // our graph is in VG format, so we've sorted this out in the constructor @@ -448,6 +450,8 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { // emit our vcf variant emit_variant(snarl, *rep_trav_finder, called_traversals, genotype, path_name); + + was_called = true; } } if (!is_vg) { @@ -458,7 +462,7 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { } } - return true; + return was_called; } string 
 string LegacyCaller::vcf_header(const PathHandleGraph& graph, const vector<string>& ref_paths,
@@ -759,15 +763,16 @@ tuple LegacyCaller::get_ref_interval(const Snarl& snarl, c
     assert(start_steps.size() > 0 && end_steps.size() > 0);
 
     step_handle_t start_step = start_steps.begin()->second;
     step_handle_t end_step = end_steps.begin()->second;
-    bool scan_backward = graph.get_is_reverse(graph.get_handle_of_step(start_step));
+    bool scan_backward = graph.get_is_reverse(graph.get_handle_of_step(start_step)) != snarl.start().backward();
 
     // if we're on a cycle, we keep our start step and find the end step by scanning the path
     if (start_steps.size() > 1 || end_steps.size() > 1) {
         bool found_end = false;
+
         if (scan_backward) {
             for (step_handle_t cur_step = start_step; graph.has_previous_step(end_step) && !found_end;
                  cur_step = graph.get_previous_step(cur_step)) {
-                if (graph.get_handle_of_step(cur_step) == end_handle) {
+                if (graph.get_id(graph.get_handle_of_step(cur_step)) == graph.get_id(end_handle)) {
                     end_step = cur_step;
                     found_end = true;
                 }
@@ -776,7 +781,7 @@ tuple LegacyCaller::get_ref_interval(const Snarl& snarl, c
         } else {
             for (step_handle_t cur_step = start_step; graph.has_next_step(end_step) && !found_end;
                  cur_step = graph.get_next_step(cur_step)) {
-                if (graph.get_handle_of_step(cur_step) == end_handle) {
+                if (graph.get_id(graph.get_handle_of_step(cur_step)) == graph.get_id(end_handle)) {
                     end_step = cur_step;
                     found_end = true;
                 }

From 1bfdc321cf6a1f3dc3a5d01223351b82606ccc2a Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 26 Nov 2019 12:52:17 -0500
Subject: [PATCH 67/79] sort before comparing vcf outputs

---
 test/t/26_deconstruct.t | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t
index 25070642c7f..c1fb4cf9045 100644
--- a/test/t/26_deconstruct.t
+++ b/test/t/26_deconstruct.t
@@ -108,11 +108,11 @@ printf "P\talt2.3\t1+,2+,4+,6+,8+,9+,11+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1M,4M,1
 printf "P\talt2.3\t1+,2+,4+,6+,8+,9+,11+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1M,4M,1M,11M\n" >> tiny_names.gfa
 vg view -Fv tiny_names.gfa > tiny_names.vg
 vg index tiny_names.vg -x tiny_names.xg
-vg deconstruct tiny_names.xg -P ref -A alt1,alt2 -e > tiny_names_decon.vcf
+vg deconstruct tiny_names.xg -P ref -A alt1,alt2 -e | sort > tiny_names_decon.vcf
 is $(grep -v "#" tiny_names_decon.vcf | wc -l) 2 "-P -A options return correct number of variants"
 is $(grep -v "#" tiny_names_decon.vcf | grep ref.1 | wc -l) 2 "-P -A options use correct reference name"
 is $(grep -v "#" tiny_names_decon.vcf | grep ref.1 | grep 14 | grep "CONFLICT=alt1" | wc -l) 1 "-P -A identifies conflict in alt1 in second variant"
-vg deconstruct tiny_names.vg -P ref -A alt1,alt2 -e > tiny_names_decon_vg.vcf
+vg deconstruct tiny_names.vg -P ref -A alt1,alt2 -e | sort > tiny_names_decon_vg.vcf
 diff tiny_names_decon.vcf tiny_names_decon_vg.vcf
 is "$?" 0 "deconstructing vg graph gives same output as xg graph"
0 "deconstructing vg graph gives same output as xg graph" From eb1da3fa345d145d9469a1496aca078b1b8f44b4 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Wed, 27 Nov 2019 13:17:27 -0800 Subject: [PATCH 68/79] add feature option --- src/subcommand/rna_main.cpp | 10 +++++++++- src/transcriptome.cpp | 4 ++-- src/transcriptome.hpp | 5 ++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 1d180231ded..2a5c1d8153a 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -23,6 +23,7 @@ void help_rna(char** argv) { cerr << "\nusage: " << argv[0] << " rna [options] > splice_graph.vg" << endl << "options:" << endl << " -n, --transcripts FILE transcript file(s) in gtf/gff format; may repeat (required)" << endl + << " -y, --feature-type NAME parse only this feature type in the gtf/gff (parse all if empty) [exon]" << endl << " -s, --transcript-tag NAME use this attribute tag in the gtf/gff file(s) as id [transcript_id]" << endl << " -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file" << endl << " -e, --use-embedded-paths project transcripts onto embedded graph paths" << endl @@ -50,6 +51,7 @@ int32_t main_rna(int32_t argc, char** argv) { } vector transcript_filenames; + string feature_type = "exon"; string transcript_tag = "transcript_id"; string haplotypes_filename; bool use_embedded_paths = false; @@ -73,6 +75,7 @@ int32_t main_rna(int32_t argc, char** argv) { static struct option long_options[] = { {"transcripts", no_argument, 0, 'n'}, + {"feature-type", no_argument, 0, 'y'}, {"transcript-tag", no_argument, 0, 's'}, {"haplotypes", no_argument, 0, 'l'}, {"use-embeded-paths", no_argument, 0, 'e'}, @@ -93,7 +96,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t option_index = 0; - c = getopt_long(argc, argv, "n:s:l:ercdoraub:gf:i:t:ph?", long_options, &option_index); + c = getopt_long(argc, argv, "n:y:s:l:ercdoraub:gf:i:t:ph?", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) @@ -106,6 +109,10 @@ int32_t main_rna(int32_t argc, char** argv) { transcript_filenames.push_back(optarg); break; + case 'y': + feature_type = optarg; + break; + case 's': transcript_tag = optarg; break; @@ -219,6 +226,7 @@ int32_t main_rna(int32_t argc, char** argv) { if (show_progress) { cerr << "[vg rna] Graph (and index) parsed in " << gcsa::readTimer() - time_parsing_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; transcriptome.num_threads = num_threads; + transcriptome.feature_type = feature_type; transcriptome.transcript_tag = transcript_tag; transcriptome.use_embedded_paths = use_embedded_paths; transcriptome.use_reference_paths = (add_reference_transcript_paths || output_reference_transcript_paths); diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 7c7e4866677..d6f2edd9244 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -91,8 +91,8 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW transcript_stream.ignore(numeric_limits::max(), '\t'); getline(transcript_stream, feature, '\t'); - // Skip all non exon features, such as cds, gene etc. - if (feature != "exon") { + // Select only relevant feature types. 
+        if (feature != feature_type && !feature_type.empty()) {
 
             transcript_stream.ignore(numeric_limits<streamsize>::max(), '\n');
             continue;
diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp
index 2ace89c60af..8d4a749f5be 100644
--- a/src/transcriptome.hpp
+++ b/src/transcriptome.hpp
@@ -87,8 +87,11 @@ class Transcriptome {
     /// Number of threads used for transcript path construction.
     int32_t num_threads = 1;
 
+    /// Feature type to parse in the gtf/gff file. Parse all types if empty.
+    string feature_type;
+
     /// Attribute tag used to parse the transcript id/name in the gtf/gff file.
-    string transcript_tag = "transcript_id";
+    string transcript_tag;
 
     /// Use all paths embedded in the graph for transcript path construction.
     bool use_embedded_paths = false;

From 026c43d89851f296a55b5facf287d118d0a40b28 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Wed, 27 Nov 2019 13:51:30 -0800
Subject: [PATCH 69/79] added gff3 support

---
 src/transcriptome.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index d6f2edd9244..6e49b573cd5 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -57,8 +57,11 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW
 
     smatch regex_id_match;
 
-    // Regex used to extract transcript name/id.
-    regex regex_id_exp(transcript_tag + "\\s{1}\"?([^\"]*)\"?");
+    // Regex used to extract transcript name/id from gtf file.
+    regex regex_id_exp_gtf(transcript_tag + "\\s{1}\"?([^\"]*)\"?;?");
+
+    // Regex used to extract transcript name/id from gff file.
+    regex regex_id_exp_gff(transcript_tag + "={1}([^;]*);?");
 
     while (transcript_stream.good()) {
 
@@ -121,8 +124,15 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW
 
         string transcript_id = "";
 
-        // Get transcript name/id from attribute column using regex.
-        if (std::regex_search(attributes, regex_id_match, regex_id_exp)) {
+        // Get transcript name/id from gtf attribute column using regex.
+        if (std::regex_search(attributes, regex_id_match, regex_id_exp_gtf)) {
 
             assert(regex_id_match.size() == 2);
             transcript_id = regex_id_match[1];
+        }
+
+        // Get transcript name/id from gff attribute column using regex.
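        // (Reader's aside with example attribute columns, assuming
        // transcript_tag = "transcript_id":
        //     GTF:  gene_id "ENSG1"; transcript_id "ENST1";
        //     GFF3: ID=ENST1;Parent=ENSG1;transcript_id=ENST1
        // regex_id_exp_gtf captures the optionally quoted value after a single
        // whitespace, regex_id_exp_gff the value between '=' and the next ';'.)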
+ if (std::regex_search(attributes, regex_id_match, regex_id_exp_gff)) { assert(regex_id_match.size() == 2); transcript_id = regex_id_match[1]; From 78d7e64187a1057921a4910ae1a30cb3be255670 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Wed, 27 Nov 2019 13:59:40 -0800 Subject: [PATCH 70/79] added error if no transcripts parsed --- src/transcriptome.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 6e49b573cd5..115ed93c7b9 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -163,9 +163,16 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW add_exon(&(transcripts.back()), make_pair(spos, epos), *chrom_path_index.second); } - reorder_exons(&transcripts.back()); + if (transcripts.empty()) { + + cerr << "[transcriptome] ERROR: No transcript where parsed (remember to set feature type \"-y\")" << endl; + exit(1); + } + delete chrom_path_index.second; + reorder_exons(&transcripts.back()); + #ifdef transcriptome_debug double time_parsing_2 = gcsa::readTimer(); cerr << "DEBUG parsing end: " << time_parsing_2 - time_parsing_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; From 15b1830a7d00abef4b2a1a6981553da1445041e0 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Wed, 27 Nov 2019 14:01:12 -0800 Subject: [PATCH 71/79] fixed typo --- src/transcriptome.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 115ed93c7b9..55854baa9ec 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -165,7 +165,7 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW if (transcripts.empty()) { - cerr << "[transcriptome] ERROR: No transcript where parsed (remember to set feature type \"-y\")" << endl; + cerr << "[transcriptome] ERROR: No transcripts parsed (remember to set feature type \"-y\")" << endl; exit(1); } From 1a8ae8247cf22bb4b9f7e16a36d3d8d0567a6e7f Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 3 Dec 2019 14:41:30 +0100 Subject: [PATCH 72/79] avoid compiler warnings with an updated bbhash --- .gitmodules | 4 ++-- deps/BBHash | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 1b80573d401..a4baa883612 100644 --- a/.gitmodules +++ b/.gitmodules @@ -122,6 +122,6 @@ [submodule "deps/mmmultimap"] path = deps/mmmultimap url = https://github.com/ekg/mmmultimap.git -[submodule "deps/BBHash"] +[submodule "vgteam_bbhash"] path = deps/BBHash - url = https://github.com/rizkg/BBHash.git + url = https://github.com/vgteam/BBHash.git diff --git a/deps/BBHash b/deps/BBHash index 88fba4e5014..36e4fe3eaee 160000 --- a/deps/BBHash +++ b/deps/BBHash @@ -1 +1 @@ -Subproject commit 88fba4e50149d2d05855df0994f668b0f82783f7 +Subproject commit 36e4fe3eaeef762c831c49cdc01f1a3a2c7a97a4 From 1f09169a0b6c4baf8ad8301f05cf42e08984daec Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 3 Dec 2019 14:52:28 +0100 Subject: [PATCH 73/79] implement BED-based kmer extraction in vg find --- src/subcommand/find_main.cpp | 41 ++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 0cdcc690589..a9bacfc4b15 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -9,6 +9,7 @@ #include "../stream_index.hpp" #include "../algorithms/sorted_id_ranges.hpp" #include 
"../algorithms/approx_path_distance.hpp" +#include "../kmer.hpp" #include #include @@ -39,6 +40,7 @@ void help_find(char** argv) { << " -E, --path-dag with -p or -R, gets any node in the partial order from pos1 to pos2, assumes id sorted DAG" << endl << " -W, --save-to PREFIX instead of writing target subgraphs to stdout," << endl << " write one per given target to a separate file named PREFIX[path]:[start]-[end].vg" << endl + << " -K, --subgraph-k K instead of graphs, write kmers from the subgraphs" << endl << "alignments:" << endl << " -d, --db-name DIR use this RocksDB database to retrieve alignments" << endl << " -l, --sorted-gam FILE use this sorted, indexed GAM file" << endl @@ -111,6 +113,7 @@ int main_find(int argc, char** argv) { bool path_dag = false; string bed_targets_file; string save_to_prefix; + int subgraph_k = 0; int c; optind = 2; // force optind past command positional argument @@ -153,11 +156,12 @@ int main_find(int argc, char** argv) { {"min-mem", required_argument, 0, 'Z'}, {"paths-named", required_argument, 0, 'Q'}, {"list-paths", no_argument, 0, 'I'}, + {"subgraph-k", required_argument, 0, 'K'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:l:amg:M:B:fDG:N:A:Y:Z:IQ:ER:W:", + c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:l:amg:M:B:fDG:N:A:Y:Z:IQ:ER:W:K:", long_options, &option_index); // Detect the end of the options. @@ -309,6 +313,10 @@ int main_find(int argc, char** argv) { to_graph_file = optarg; break; + case 'K': + subgraph_k = atoi(optarg); + break; + case 'h': case '?': help_find(argv); @@ -643,8 +651,37 @@ int main_find(int argc, char** argv) { VG empty; graph = empty; } + if (subgraph_k) { + // enumerate the kmers, calculating including their start positions relative to the reference + // and write to stdout? + for_each_kmer(graph, subgraph_k, + [&](const kmer_t& kmer) { + // get the reference-relative position + string start_str, end_str; + for (auto& p : algorithms::nearest_offsets_in_paths(xindex, kmer.begin, subgraph_k*2)) { + const uint64_t& start_p = p.second.front().first; + const bool& start_rev = p.second.front().second; + if (p.first == path_handle && (!start_rev && start_p >= target.start || start_rev && start_p <= target.end)) { + start_str = target.seq + ":" + std::to_string(start_p) + (p.second.front().second ? "-" : "+"); + } + } + for (auto& p : algorithms::nearest_offsets_in_paths(xindex, kmer.end, subgraph_k*2)) { + const uint64_t& end_p = p.second.front().first; + const bool& end_rev = p.second.front().second; + if (p.first == path_handle && (!end_rev && end_p <= target.end || end_rev && end_p >= target.start)) { + end_str = target.seq + ":" + std::to_string(end_p) + (p.second.front().second ? 
"-" : "+"); + } + } + if (!start_str.empty() && !end_str.empty()) { + // write our record +#pragma omp critical (cout) + cout << target.seq << ":" << target.start << "-" << target.end << "\t" + << kmer.seq << "\t" << start_str << "\t" << end_str << std::endl; + } + }); + } } - if (save_to_prefix.empty()) { + if (save_to_prefix.empty() && !subgraph_k) { prep_graph(); graph.serialize_to_ostream(cout); } From 48e7dca839bcb105923c61d1f8764a8f691ef939 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 3 Dec 2019 14:56:18 +0100 Subject: [PATCH 74/79] add a basic test for the new BED based kmer extraction --- test/t/05_vg_find.t | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index 3b0550a6c89..a7fc6d9dfbf 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 27 +plan tests 28 vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg is $? 0 "construction" @@ -107,6 +107,8 @@ is $((vg view t.x:30:35.vg; vg view t.x:10:20.vg) | wc -l) 20 "we can extract a echo x 30 36 | tr ' ' '\t' >t.bed echo x 10 21 | tr ' ' '\t' >>t.bed vg find -x t.xg -E -R t.bed -W q. -is $((vg view q.x:10:20.vg; vg view q.x:30:35.vg) | md5sum | cut -f 1 -d\ ) $((vg view t.x:10:20.vg ; vg view t.x:30:35.vg)| md5sum | cut -f 1 -d\ ) "the same extraction can be made using BEd input" +is $((vg view q.x:10:20.vg; vg view q.x:30:35.vg) | md5sum | cut -f 1 -d\ ) $((vg view t.x:10:20.vg ; vg view t.x:30:35.vg)| md5sum | cut -f 1 -d\ ) "the same extraction can be made using BED input" + +is $(vg find -x t.xg -E -p x:30-35 -p x:10-20 -K 5 | wc -l) 22 "we see the expected number of kmers in the given targets" rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed From d7bdb0d0014a52c7d5c93788ae79a497e623f294 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Tue, 3 Dec 2019 11:35:20 -0800 Subject: [PATCH 75/79] handlifyed vg rna --- src/io/register_loader_saver_hash_graph.cpp | 2 +- src/io/register_loader_saver_odgi.cpp | 2 +- src/io/register_loader_saver_packed_graph.cpp | 2 +- src/io/register_loader_saver_vg.cpp | 2 +- src/path_index.cpp | 10 +- src/subcommand/rna_main.cpp | 8 +- src/transcriptome.cpp | 294 +++++++----------- src/transcriptome.hpp | 11 +- 8 files changed, 124 insertions(+), 207 deletions(-) diff --git a/src/io/register_loader_saver_hash_graph.cpp b/src/io/register_loader_saver_hash_graph.cpp index f2d4b1fdc10..fb8db9ad538 100644 --- a/src/io/register_loader_saver_hash_graph.cpp +++ b/src/io/register_loader_saver_hash_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_hash_graph() { - Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { // Allocate a HashGraph bdsg::HashGraph* hash_graph = new bdsg::HashGraph(); diff --git a/src/io/register_loader_saver_odgi.cpp b/src/io/register_loader_saver_odgi.cpp index 0bf21f22c86..97a0d3d1c6f 100644 --- a/src/io/register_loader_saver_odgi.cpp +++ b/src/io/register_loader_saver_odgi.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_odgi() { - Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { // Allocate a PackedGraph bdsg::ODGI* odgi = new bdsg::ODGI(); diff --git 
a/src/io/register_loader_saver_packed_graph.cpp b/src/io/register_loader_saver_packed_graph.cpp index 510ead17723..4a7baeac7ca 100644 --- a/src/io/register_loader_saver_packed_graph.cpp +++ b/src/io/register_loader_saver_packed_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_packed_graph() { - Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { // Allocate a PackedGraph bdsg::PackedGraph* packed_graph = new bdsg::PackedGraph(); diff --git a/src/io/register_loader_saver_vg.cpp b/src/io/register_loader_saver_vg.cpp index a1554008df8..41630f5c2d2 100644 --- a/src/io/register_loader_saver_vg.cpp +++ b/src/io/register_loader_saver_vg.cpp @@ -17,7 +17,7 @@ using namespace vg::io; void register_loader_saver_vg() { // We register for "" so we can handle untagged old-style vg files and make them into HandleGraphs - Registry::register_loader_saver(vector{"VG", ""}, + Registry::register_loader_saver(vector{"VG", ""}, [](const message_sender_function_t& for_each_message) -> void* { // We have a bit of a control problem. // The source function wants to drive; we give it a function of strings, and it calls it with all the strings in turn. diff --git a/src/path_index.cpp b/src/path_index.cpp index e5ed83e3ca6..8e215a22e4c 100644 --- a/src/path_index.cpp +++ b/src/path_index.cpp @@ -267,14 +267,8 @@ PathIndex::PathIndex(const PathHandleGraph& graph, const string& path_name, bool assert(graph.has_path(path_name)); // Make a Protobuf path object - Path path; - for (handle_t handle : graph.scan_path(graph.get_path_handle(path_name))) { - Mapping* mapping = path.add_mapping(); - Position* position = mapping->mutable_position(); - position->set_node_id(graph.get_id(handle)); - position->set_is_reverse(graph.get_is_reverse(handle)); - } - + auto path = path_from_path_handle(graph, graph.get_path_handle(path_name)); + if (extract_sequence) { // Constructor dispatch hack *this = PathIndex(path, graph); diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 2a5c1d8153a..596d2bcb66b 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -28,7 +28,7 @@ void help_rna(char** argv) { << " -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file" << endl << " -e, --use-embedded-paths project transcripts onto embedded graph paths" << endl << " -c, --do-not-collapse do not collapse identical transcripts across haplotypes" << endl - << " -d, --remove-non-gene remove intergenic and intronic regions (removes reference paths if -a or -r)" << endl + << " -d, --remove-non-gene remove intergenic and intronic regions (removes reference paths)" << endl << " -o, --do-not-sort do not topological sort and compact splice graph" << endl << " -r, --add-ref-paths add reference transcripts as embedded paths in the splice graph" << endl << " -a, --add-non-ref-paths add non-reference transcripts as embedded paths in the splice graph" << endl @@ -201,6 +201,10 @@ int32_t main_rna(int32_t argc, char** argv) { return 1; } + if (remove_non_transcribed && !add_reference_transcript_paths && !add_non_reference_transcript_paths) { + + cerr << "[vg rna] WARNING: No haplotypes or paths were given for transcript projection. Use --haplotypes FILE and/or --use-embeded-paths." << endl; + } double time_parsing_start = gcsa::readTimer(); if (show_progress) { cerr << "[vg rna] Parsing graph file ..." 
<< endl; } @@ -287,7 +291,7 @@ int32_t main_rna(int32_t argc, char** argv) { if (show_progress) { cerr << "[vg rna] Adding " << ((add_reference_transcript_paths) ? "reference" : "non-reference") << " transcript paths to splice graph ..." << endl; } } - transcriptome.embed_transcript_paths(add_reference_transcript_paths, add_non_reference_transcript_paths, false); + transcriptome.embed_transcript_paths(add_reference_transcript_paths, add_non_reference_transcript_paths); if (show_progress) { cerr << "[vg rna] Paths added in " << gcsa::readTimer() - time_add_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; } diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 55854baa9ec..1553900890c 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -3,22 +3,25 @@ #include "../algorithms/topological_sort.hpp" #include "../algorithms/apply_bulk_modifications.hpp" +#include "../io/save_handle_graph.hpp" #include "transcriptome.hpp" #include "../gbwt_helper.hpp" +#include "../augment.hpp" +#include "../utility.hpp" namespace vg { using namespace std; -// #define transcriptome_debug +#define transcriptome_debug Transcriptome::Transcriptome(const string & graph_filename, const bool show_progress) { // Load variation graph. get_input_file(graph_filename, [&](istream& in) { - _splice_graph = new VG(in, show_progress); + _splice_graph = vg::io::VPKG::load_one(in); }); if (!_splice_graph) { @@ -27,11 +30,6 @@ Transcriptome::Transcriptome(const string & graph_filename, const bool show_prog } } -Transcriptome::~Transcriptome() { - - delete _splice_graph; -} - void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBWT & haplotype_index) { #ifdef transcriptome_debug @@ -42,7 +40,14 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW vector transcripts; // Get mean length of nodes in the graph. - const float mean_node_length = _splice_graph->length() / static_cast(_splice_graph->size()); + double total_node_length = 0; + assert(_splice_graph->for_each_handle([&](const handle_t & handle) { + + total_node_length += _splice_graph->get_length(handle); + })); + + const float mean_node_length = total_node_length / _splice_graph->get_node_count(); + pair chrom_path_index("", nullptr); int32_t line_number = 0; @@ -75,7 +80,7 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW continue; } - if (!_splice_graph->paths.has_path(chrom)) { + if (!_splice_graph->has_path(chrom)) { cerr << "[transcriptome] ERROR: Chromomsome path \"" << chrom << "\" not found in graph (line " << line_number << ")." << endl; exit(1); @@ -86,7 +91,7 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW chrom_path_index.first = chrom; // Construct path index for chromosome/contig. 
-            chrom_path_index.second = new PathIndex(*_splice_graph, chrom);
+            chrom_path_index.second = new PathIndex(*_splice_graph, chrom_path_index.first);
         }
 
         assert(chrom_path_index.second);
@@ -452,7 +457,7 @@ list<TranscriptPath> Transcriptome::project_transcript_gbwt(const Transcript & c
             for (auto & exon_node: haplotype.first.at(exon_idx)) {
 
                 auto node_id = gbwt::Node::id(exon_node);
-                auto node_length = _splice_graph->get_node(node_id)->sequence().size();
+                auto node_length = _splice_graph->get_length(_splice_graph->get_handle(node_id));
 
                 int32_t offset = 0;
@@ -511,7 +516,7 @@ list<TranscriptPath> Transcriptome::project_transcript_gbwt(const Transcript & c
         if (cur_transcript.is_reverse) {
 
             // Reverse complement transcript paths that are on the '-' strand.
-            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_node(node_id)->sequence().size();});
+            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_length(_splice_graph->get_handle(node_id));});
         }
 
         // Copy paths if collapse of identical transcript paths is not wanted.
@@ -635,38 +640,45 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
 
 list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript & cur_transcript) const {
 
-    vector<map<int64_t, set<mapping_t *> > *> exon_start_node_mappings;
-    vector<map<int64_t, set<mapping_t *> > *> exon_end_node_mappings;
+    vector<unordered_map<path_handle_t, step_handle_t> > exon_start_node_path_steps;
+    vector<unordered_map<path_handle_t, step_handle_t> > exon_end_node_path_steps;
 
-    exon_start_node_mappings.reserve(cur_transcript.exon_border_nodes.size());
-    exon_end_node_mappings.reserve(cur_transcript.exon_border_nodes.size());
+    exon_start_node_path_steps.reserve(cur_transcript.exon_border_nodes.size());
+    exon_end_node_path_steps.reserve(cur_transcript.exon_border_nodes.size());
 
     // Get embedded path ids and node mappings for all exon border nodes in transcript.
     for (auto & exon_node: cur_transcript.exon_border_nodes) {
 
-        exon_start_node_mappings.emplace_back(&_splice_graph->paths.get_node_mapping(exon_node.first.node_id()));
-        exon_end_node_mappings.emplace_back(&_splice_graph->paths.get_node_mapping(exon_node.second.node_id()));
+        exon_start_node_path_steps.emplace_back(unordered_map<path_handle_t, step_handle_t>());
+        _splice_graph->for_each_step_on_handle(_splice_graph->get_handle(exon_node.first.node_id()), [&](const step_handle_t & step) {
+            assert(exon_start_node_path_steps.back().emplace(_splice_graph->get_path_handle_of_step(step), step).second);
+        });
+
+        exon_end_node_path_steps.emplace_back(unordered_map<path_handle_t, step_handle_t>());
+        _splice_graph->for_each_step_on_handle(_splice_graph->get_handle(exon_node.second.node_id()), [&](const step_handle_t & step) {
+            assert(exon_end_node_path_steps.back().emplace(_splice_graph->get_path_handle_of_step(step), step).second);
+        });
     }
 
     list<TranscriptPath> cur_transcript_paths;
 
     // Loop over all paths that contain the transcript start node.
-    for (auto & path_mapping_start: *exon_start_node_mappings.front()) {
+    for (auto & path_steps_start: exon_start_node_path_steps.front()) {
 
         // Skip path if transcript end node is not in the current path.
-        if (exon_end_node_mappings.back()->find(path_mapping_start.first) == exon_end_node_mappings.back()->end()) {
+        if (exon_end_node_path_steps.back().find(path_steps_start.first) == exon_end_node_path_steps.back().end()) {
 
             continue;
         }
 
+        const auto path_origin_name = _splice_graph->get_path_name(path_steps_start.first);
+
         // Skip alternative allele paths (_alt).
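        // (Reader's aside: Paths::is_alt is a name-prefix test; graphs built with
        // vg construct -a carry one embedded path per variant allele, named like
        // "_alt_<variant hash>_<allele number>", and those must not be mistaken
        // for reference or haplotype paths when projecting transcripts.)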
- if (Paths::is_alt(_splice_graph->paths.get_path_name(path_mapping_start.first))) { + if (Paths::is_alt(path_origin_name)) { continue; } - const auto path_origin_name = _splice_graph->paths.get_path_name(path_mapping_start.first); - // Only construct transcript paths originating from a reference chromosome/contig. if (path_origin_name != cur_transcript.chrom && !use_embedded_paths && use_reference_paths) { @@ -688,59 +700,43 @@ list Transcriptome::project_transcript_embedded(const Transcript bool is_partial = false; - mapping_t * haplotype_path_start_map = nullptr; - mapping_t * haplotype_path_end_map = nullptr; - - for (size_t exon_idx = 0; exon_idx < exon_start_node_mappings.size(); ++exon_idx) { - - auto haplotype_path_start_it = exon_start_node_mappings.at(exon_idx)->find(path_mapping_start.first); - auto haplotype_path_end_it = exon_end_node_mappings.at(exon_idx)->find(path_mapping_start.first); + for (size_t exon_idx = 0; exon_idx < exon_start_node_path_steps.size(); ++exon_idx) { - // Get path mapping at exon start if exon start node is in the current path. - if (haplotype_path_start_it != exon_start_node_mappings.at(exon_idx)->end()) { - - assert(haplotype_path_start_it->second.size() == 1); - haplotype_path_start_map = *haplotype_path_start_it->second.begin(); - } - - // Get path mapping at exon end if exon end node is in the current path. - if (haplotype_path_end_it != exon_end_node_mappings.at(exon_idx)->end()) { - - assert(haplotype_path_end_it->second.size() == 1); - haplotype_path_end_map = *haplotype_path_end_it->second.begin(); - } + auto haplotype_path_start_it = exon_start_node_path_steps.at(exon_idx).find(path_steps_start.first); + auto haplotype_path_end_it = exon_end_node_path_steps.at(exon_idx).find(path_steps_start.first); // Transcript paths are partial if either the start or end exon path - // mapping is empty. Partial transcripts are currently not supported. + // step is empty. Partial transcripts are currently not supported. // TODO: Add support for partial transcript paths. - if (!haplotype_path_start_map || !haplotype_path_end_map) { + if ((haplotype_path_start_it == exon_start_node_path_steps.at(exon_idx).end()) || haplotype_path_end_it == exon_end_node_path_steps.at(exon_idx).end()) { is_partial = true; break; } - bool is_first_mapping = true; + // Get path step at exon start if exon start node is in the current path. + auto haplotype_path_start_step = haplotype_path_start_it->second; - while (true) { + // Get path mapping at exon end if exon end node is in the current path. + auto haplotype_path_end_step = haplotype_path_end_it->second; - auto cur_node_id = haplotype_path_start_map->node_id(); - auto node_length = _splice_graph->get_node(cur_node_id)->sequence().size(); - assert(node_length == haplotype_path_start_map->length); + bool is_first_step = true; + while (true) { + + auto node_length = _splice_graph->get_length(_splice_graph->get_handle_of_step(haplotype_path_start_step)); int32_t offset = 0; // Adjust start position from exon border (last position in upstream intron) // to first position in exon. 
-                if (is_first_mapping) {
+                if (is_first_step) {
 
                     if (cur_transcript.exon_border_nodes.at(exon_idx).first.offset() + 1 == node_length) {
 
-                        assert(haplotype_path_start_map != haplotype_path_end_map);
-
-                        haplotype_path_start_map = _splice_graph->paths.traverse_right(haplotype_path_start_map);
-                        assert(haplotype_path_start_map);
+                        assert(haplotype_path_start_step != haplotype_path_end_step);
+                        haplotype_path_start_step = _splice_graph->get_next_step(haplotype_path_start_step);
+                        is_first_step = false;
 
-                        is_first_mapping = false;
                         continue;
 
                     } else {
 
@@ -753,7 +749,7 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
 
                 // Adjust end position from exon border (first position in downstream intron)
                 // to last position in exon.
-                if (haplotype_path_start_map == haplotype_path_end_map) {
+                if (haplotype_path_start_step == haplotype_path_end_step) {
 
                     if (cur_transcript.exon_border_nodes.at(exon_idx).second.offset() == 0) {
 
@@ -773,7 +769,7 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
 
                 auto new_mapping = cur_transcript_paths.back().path.add_mapping();
                 new_mapping->set_rank(cur_transcript_paths.back().path.mapping_size());
 
-                new_mapping->mutable_position()->set_node_id(cur_node_id);
+                new_mapping->mutable_position()->set_node_id(_splice_graph->get_id(_splice_graph->get_handle_of_step(haplotype_path_start_step)));
                 new_mapping->mutable_position()->set_offset(offset);
                 new_mapping->mutable_position()->set_is_reverse(false);
 
@@ -782,16 +778,11 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
                 new_edit->set_from_length(edit_length);
                 new_edit->set_to_length(edit_length);
 
-                if (haplotype_path_start_map == haplotype_path_end_map) { break; }
+                if (haplotype_path_start_step == haplotype_path_end_step) { break; }
 
-                haplotype_path_start_map = _splice_graph->paths.traverse_right(haplotype_path_start_map);
-                assert(haplotype_path_start_map);
-
-                is_first_mapping = false;
+                haplotype_path_start_step = _splice_graph->get_next_step(haplotype_path_start_step);
+                is_first_step = false;
             }
-
-            haplotype_path_start_map = nullptr;
-            haplotype_path_end_map = nullptr;
         }
 
         if (is_partial) {
@@ -806,7 +797,7 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
 
         // Reverse complement transcript paths that are on the '-' strand.
         if (cur_transcript.is_reverse) {
 
-            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_node(node_id)->sequence().size();});
+            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_length(_splice_graph->get_handle(node_id));});
         }
     }
 }
@@ -871,7 +862,7 @@ bool Transcriptome::add_novel_transcript_junctions(const list<TranscriptPath> &
 
         auto & cur_mapping = transcript_path.path.mapping(i);
 
-        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || !_splice_graph->has_node(cur_mapping.position().node_id()) || _splice_graph->get_node(cur_mapping.position().node_id())->sequence().size() != cur_mapping.edit(0).from_length()) {
+        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || !_splice_graph->has_node(cur_mapping.position().node_id()) || _splice_graph->get_length(_splice_graph->get_handle(cur_mapping.position().node_id())) != cur_mapping.edit(0).from_length()) {
 
             all_junctions_added = false;
             i++;
 
@@ -882,12 +873,12 @@ bool Transcriptome::add_novel_transcript_junctions(const list<TranscriptPath> &
 
             auto & prev_mapping = transcript_path.path.mapping(i - 1);
 
-            auto prev_node_side = NodeSide(prev_mapping.position().node_id(), (prev_mapping.position().is_reverse() ? false : true));
-            auto cur_node_side = NodeSide(cur_mapping.position().node_id(), (cur_mapping.position().is_reverse() ? true : false));
+            auto prev_handle = _splice_graph->get_handle(prev_mapping.position().node_id(), prev_mapping.position().is_reverse());
+            auto cur_handle = _splice_graph->get_handle(cur_mapping.position().node_id(), cur_mapping.position().is_reverse());
 
-            if (!_splice_graph->has_edge(prev_node_side, cur_node_side)) {
+            if (!_splice_graph->has_edge(prev_handle, cur_handle)) {
 
-                _splice_graph->create_edge(prev_node_side, cur_node_side);
+                _splice_graph->create_edge(prev_handle, cur_handle);
             }
         }
     }
@@ -930,9 +921,11 @@ void Transcriptome::add_paths_to_transcriptome(list<TranscriptPath> * new_transc
     cerr << "DEBUG edit start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl;
 #endif
 
+    stringstream gam_out_stream;
+
     // Edit splice graph with projected transcript paths and
     // update path traversals to match the augmented graph.
-    _splice_graph->edit(edit_paths, nullptr, false, true, true);
+    augment(static_cast<MutablePathMutableHandleGraph *>(_splice_graph.get()), edit_paths, nullptr, &gam_out_stream, false, true);
 
 #ifdef transcriptome_debug
     double time_edit_2 = gcsa::readTimer();
 #endif
 
     // Update projected transcript paths with new path traversals.
-    assert(edit_paths.size() == new_transcript_paths->size());
     auto new_transcript_paths_it = new_transcript_paths->begin();
-    auto edit_paths_it = edit_paths.begin();
-
-    while (new_transcript_paths_it != new_transcript_paths->end()) {
-
-        new_transcript_paths_it->path = move(*edit_paths_it);
+
+    vg::io::for_each(gam_out_stream, [&](vg::Alignment & alignment) {
+
+        new_transcript_paths_it->path = move(alignment.path());
         ++new_transcript_paths_it;
-        ++edit_paths_it;
-    }
+    });
 
-    assert(edit_paths_it == edit_paths.end());
+    assert(new_transcript_paths_it == new_transcript_paths->end());
 }
 
 _transcript_paths.reserve(_transcript_paths.size() + new_transcript_paths->size());
@@ -975,26 +963,21 @@ int32_t Transcriptome::size() const {
 
     return _transcript_paths.size();
 }
 
-const VG & Transcriptome::splice_graph() const {
+const MutablePathDeletableHandleGraph & Transcriptome::splice_graph() const {
 
     return *_splice_graph;
 }
 
 void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
 
-    // Save copy of embedded reference paths
-    Paths reference_paths;
-    if (new_reference_paths) {
-
-        reference_paths = _splice_graph->paths;
-    }
-
     // Remove non transcript paths.
-    _splice_graph->clear_paths();
+    assert(_splice_graph->for_each_path_handle([&](const path_handle_t & path_handle) {
+
+        _splice_graph->destroy_path(path_handle);
+    }));
 
-    // Find all nodes and edges that are in a transcript path.
+    // Find all nodes that are in a transcript path.
     unordered_set<vg::id_t> transcribed_nodes;
-    unordered_set<pair<vg::id_t, vg::id_t> > transcribed_edges;
 
     for (auto & transcript_path: _transcript_paths) {
 
@@ -1004,125 +987,61 @@ void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
         for (size_t i = 1; i < transcript_path.path.mapping_size(); i++) {
 
             transcribed_nodes.emplace(transcript_path.path.mapping(i).position().node_id());
-            transcribed_edges.emplace(transcript_path.path.mapping(i-1).position().node_id(), transcript_path.path.mapping(i).position().node_id());
         }
     }
 
-    // Find all nodes that are not in a transcript path.
+    // Delete all nodes that are not in a transcript path.
     vector<vg::id_t> non_transcribed_nodes;
 
-    _splice_graph->for_each_node([&](const Node * node) {
+    assert(_splice_graph->for_each_handle([&](const handle_t & handle) {
 
-        if (transcribed_nodes.count(node->id()) == 0) {
+        if (transcribed_nodes.count(_splice_graph->get_id(handle)) == 0) {
 
-            non_transcribed_nodes.emplace_back(node->id());
+            _splice_graph->destroy_handle(handle);
         }
-    });
-
-    for (auto & node: non_transcribed_nodes) {
-
-        // Delete node and in/out edges.
-        _splice_graph->destroy_node(node);
-    }
-
-    // Create new reference paths that only include trancribed nodes and edges.
-    if (new_reference_paths) {
-
-        reference_paths.for_each([&](const Path & path) {
-
-            if (!Paths::is_alt(path.name())) {
-
-                vector<Path> new_paths;
-
-                new_paths.emplace_back(Path());
-                new_paths.back().set_name(path.name() + "_" + to_string(new_paths.size() - 1));
-
-                for (auto & mapping: path.mapping()) {
-
-                    auto cur_node_id = mapping.position().node_id();
-
-                    if (new_paths.back().mapping_size() == 0) {
-
-                        if (transcribed_nodes.count(cur_node_id) > 0) {
-
-                            auto new_mapping = new_paths.back().add_mapping();
-                            *new_mapping = mapping;
-                            new_mapping->set_rank(new_paths.back().mapping_size());
-                        }
-
-                    } else {
-
-                        auto prev_node_id = new_paths.back().mapping(new_paths.back().mapping_size() - 1).position().node_id();
-
-                        // Extend new path, if transcribed edge (forward or reverse) exist between
-                        // this and the previous node in the path.
-                        if (transcribed_edges.count(make_pair(prev_node_id, cur_node_id)) > 0 || transcribed_edges.count(make_pair(cur_node_id, prev_node_id)) > 0) {
-
-                            auto new_mapping = new_paths.back().add_mapping();
-                            *new_mapping = mapping;
-                            new_mapping->set_rank(new_paths.back().mapping_size());
-
-                        } else {
-
-                            new_paths.emplace_back(Path());
-                            new_paths.back().set_name(path.name() + "_" + to_string(new_paths.size() - 1));
-                        }
-                    }
-                }
-
-                // Add new reference paths to graph without rebuild paths indexes.
-                _splice_graph->paths.extend(new_paths, false, false);
-            }
-        });
-
-        // Rebuild paths indexes.
-        _splice_graph->paths.compact_ranks();
-    }
 }
 
 void Transcriptome::compact_ordered() {
 
     // Find and apply topological ordering
-    auto topological_ordering = algorithms::topological_order(_splice_graph);
-    _splice_graph->apply_ordering(topological_ordering);
+    _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), false);
 
-    // Compact node ids and update embedded paths.
-    hash_map<vg::id_t, vg::id_t> compacted_nodes;
-    _splice_graph->compact_ids(compacted_nodes);
+    // TODO: Compact nodes for other graph types
+    VG * vg_splice_graph = dynamic_cast<VG *>(_splice_graph.get());
+    if (vg_splice_graph != nullptr) {
 
-    // Update transcript paths with compacted node ids
-    for (auto & transcript_path: _transcript_paths) {
-
-        for (auto & mapping: *transcript_path.path.mutable_mapping()) {
+        // Compact node ids and update embedded paths
+        hash_map<vg::id_t, vg::id_t> compacted_nodes;
+        vg_splice_graph->compact_ids(compacted_nodes);
 
-            mapping.mutable_position()->set_node_id(compacted_nodes.at(mapping.position().node_id()));
+        // Update transcript paths with compacted node ids
+        for (auto & transcript_path: _transcript_paths) {
+
+            for (auto & mapping: *transcript_path.path.mutable_mapping()) {
+
+                mapping.mutable_position()->set_node_id(compacted_nodes.at(mapping.position().node_id()));
+            }
         }
     }
 }
 
-void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths, const bool rebuild_indexes) {
+void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths) {
 
     // Add transcript paths to graph
     for (auto & transcript_path: _transcript_paths) {
 
         assert(!transcript_path.haplotype_origins.empty() || !transcript_path.reference_origin.empty());
-        transcript_path.path.set_name(transcript_path.name);
+        auto path_handle = _splice_graph->create_path_handle(transcript_path.name);
 
-        if (add_reference_paths && !transcript_path.reference_origin.empty()) {
-
-            _splice_graph->paths.extend(transcript_path.path, false, false);
-
-        } else if (add_non_reference_paths && !transcript_path.haplotype_origins.empty()) {
+        if ((add_reference_paths && !transcript_path.reference_origin.empty()) || (add_non_reference_paths && !transcript_path.haplotype_origins.empty())) {
 
-            _splice_graph->paths.extend(transcript_path.path, false, false);
-        }
+            for (auto & mapping: transcript_path.path.mapping()) {
 
-        transcript_path.path.set_name("");
-    }
-
-    // Rebuild paths indexes.
-    if (rebuild_indexes) {
+                auto handle = _splice_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse());
+                _splice_graph->append_step(path_handle, handle);
+            }
 
-        _splice_graph->paths.compact_ranks();
+        }
     }
 }
 
@@ -1187,8 +1106,7 @@ void Transcriptome::write_sequences(ostream * fasta_ostream, const bool output_r
 
     if (!transcript_path.haplotype_origins.empty() || output_reference_transcripts) {
 
         // Write transcript path name and sequence.
-        *fasta_ostream << ">" << transcript_path.name << endl;
-        *fasta_ostream << _splice_graph->path_sequence(transcript_path.path) << endl;
+        write_fasta_sequence(transcript_path.name, path_sequence(*_splice_graph, transcript_path.path), cout);
     }
 }
 }
@@ -1240,7 +1158,7 @@ void Transcriptome::write_info(ostream * tsv_ostream, const bool output_referenc
 
 void Transcriptome::write_splice_graph(ostream * vg_ostream) {
 
-    _splice_graph->serialize_to_ostream(*vg_ostream);
+    vg::io::save_handle_graph(_splice_graph.get(), *vg_ostream);
 }
 
 }
diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp
index 8d4a749f5be..f9fa8effa77 100644
--- a/src/transcriptome.hpp
+++ b/src/transcriptome.hpp
@@ -7,6 +7,9 @@
 #include 
 #include 
 
+#include 
+#include 
+#include 
 
 #include "../vg.hpp"
 #include "../path_index.hpp"
@@ -82,7 +85,6 @@ class Transcriptome {
 
     public:
 
     Transcriptome(const string &, const bool);
-    ~Transcriptome();
 
     /// Number of threads used for transcript path construction.
     int32_t num_threads = 1;
@@ -113,7 +115,7 @@ class Transcriptome {
 
     int32_t size() const;
 
     /// Returns spliced variation graph.
-    const VG & splice_graph() const;
+    const MutablePathDeletableHandleGraph & splice_graph() const;
 
     /// Removes non-transcribed (not in transcript paths) nodes.
     /// Optionally create new reference paths that only include
@@ -124,8 +126,7 @@ class Transcriptome {
 
     void compact_ordered();
 
     /// Embeds transcript paths in spliced variation graph.
-    /// Optionally rebuild paths indexes.
-    void embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths, const bool rebuild_indexes);
+    void embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths);
 
     /// Add transcript paths as threads in GBWT index.
     void construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts, const bool add_bidirectional) const;
@@ -148,7 +149,7 @@ class Transcriptome {
 
     vector<TranscriptPath> _transcript_paths;
 
     /// Spliced variation graph.
-    VG * _splice_graph;
+    unique_ptr<MutablePathDeletableHandleGraph> _splice_graph;
 
     /// Finds the position of each end of a exon on a path in the
     /// variation graph and adds the exon to a transcript.
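
Note on the API migration in the patch above: vg's Paths index calls
(paths.get_node_mapping(), traverse_right(), raw mapping_t pointers) are
replaced with libhandlegraph's step API. The following is a minimal sketch of
the new lookup idiom only, not part of the patch series; `steps_on_node` is an
illustrative helper name rather than a vg function, and it assumes the
std::hash specialization for path_handle_t provided by handlegraph/util.hpp:

    #include <unordered_map>

    #include <handlegraph/path_handle_graph.hpp>
    #include <handlegraph/util.hpp>

    using namespace std;
    using namespace handlegraph;

    // Collect one step per embedded path visiting a node, mirroring how
    // project_transcript_embedded() builds exon_start_node_path_steps above.
    unordered_map<path_handle_t, step_handle_t> steps_on_node(const PathHandleGraph & graph, const nid_t node_id) {

        unordered_map<path_handle_t, step_handle_t> steps;

        graph.for_each_step_on_handle(graph.get_handle(node_id), [&](const step_handle_t & step) {

            // emplace() silently keeps the first step if a path visits the
            // node twice; the patch instead asserts that this cannot happen.
            steps.emplace(graph.get_path_handle_of_step(step), step);
        });

        return steps;
    }

Walking a path from such a step then uses get_next_step() and
get_handle_of_step(), which is how the patch replaces Paths::traverse_right()
and per-mapping node lookups.
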
From 930e335a5179cc64705c0303cf2e7706496508c6 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 13:48:18 -0800
Subject: [PATCH 76/79] fix warning

---
 src/subcommand/rna_main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp
index 596d2bcb66b..529db98ea7e 100644
--- a/src/subcommand/rna_main.cpp
+++ b/src/subcommand/rna_main.cpp
@@ -28,7 +28,7 @@ void help_rna(char** argv) {
         << "    -l, --haplotypes FILE      project transcripts onto haplotypes in GBWT index file" << endl
         << "    -e, --use-embedded-paths   project transcripts onto embedded graph paths" << endl
         << "    -c, --do-not-collapse      do not collapse identical transcripts across haplotypes" << endl
-        << "    -d, --remove-non-gene      remove intergenic and intronic regions (removes reference paths)" << endl
+        << "    -d, --remove-non-gene      remove intergenic and intronic regions (deletes reference paths)" << endl
         << "    -o, --do-not-sort          do not topological sort and compact splice graph" << endl
         << "    -r, --add-ref-paths        add reference transcripts as embedded paths in the splice graph" << endl
         << "    -a, --add-non-ref-paths    add non-reference transcripts as embedded paths in the splice graph" << endl
@@ -203,7 +203,7 @@ int32_t main_rna(int32_t argc, char** argv) {
 
     if (remove_non_transcribed && !add_reference_transcript_paths && !add_non_reference_transcript_paths) {
 
-        cerr << "[vg rna] WARNING: No haplotypes or paths were given for transcript projection. Use --haplotypes FILE and/or --use-embeded-paths." << endl;
+        cerr << "[vg rna] WARNING: Reference paths are deleted when removing intergenic and intronic regions. Consider adding transcripts as embedded paths using --add-ref-paths and/or --add-non-ref-paths." << endl;
     }
 
     double time_parsing_start = gcsa::readTimer();

From 689a03b24da6352200d938232f4d3bf995390b31 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 15:14:21 -0800
Subject: [PATCH 77/79] compact ids for non-vg graphs

---
 src/transcriptome.cpp | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index 1553900890c..9258419f80c 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -1003,13 +1003,13 @@ void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
 
 void Transcriptome::compact_ordered() {
 
-    // Find and apply topological ordering
-    _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), false);
-
-    // TODO: Compact nodes for other graph types
     VG * vg_splice_graph = dynamic_cast<VG *>(_splice_graph.get());
+
     if (vg_splice_graph != nullptr) {
 
+        // Find and apply topological ordering
+        _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), false);
+
         // Compact node ids and update embedded paths
         hash_map<vg::id_t, vg::id_t> compacted_nodes;
         vg_splice_graph->compact_ids(compacted_nodes);
@@ -1022,6 +1022,21 @@ void Transcriptome::compact_ordered() {
 
             mapping.mutable_position()->set_node_id(compacted_nodes.at(mapping.position().node_id()));
         }
     }
+
+    } else {
+
+        // Add transcript paths to graph in order to compact ids of non-vg graphs.
+        // TODO: Find better solution.
+        embed_transcript_paths(true, true);
+        _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), true);
+
+        for (auto & transcript_path: _transcript_paths) {
+
+            auto path_handle = _splice_graph->get_path_handle(transcript_path.name);
+            transcript_path.path = path_from_path_handle(*_splice_graph, path_handle);
+
+            _splice_graph->destroy_path(path_handle);
+        }
     }
 }
 
@@ -1106,7 +1121,7 @@ void Transcriptome::write_sequences(ostream * fasta_ostream, const bool output_r
 
     if (!transcript_path.haplotype_origins.empty() || output_reference_transcripts) {
 
         // Write transcript path name and sequence.
-        write_fasta_sequence(transcript_path.name, path_sequence(*_splice_graph, transcript_path.path), cout);
+        write_fasta_sequence(transcript_path.name, path_sequence(*_splice_graph, transcript_path.path), *fasta_ostream);
     }
 }
 
From 6b463db4e1d3683ffaca2254b932112751d66bf4 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 17:04:51 -0800
Subject: [PATCH 78/79] fix destroy_handle handle issue

---
 src/transcriptome.cpp | 52 +++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index 9258419f80c..cb303be1b56 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -862,14 +862,12 @@ bool Transcriptome::add_novel_transcript_junctions(const list<TranscriptPath> &
 
         auto & cur_mapping = transcript_path.path.mapping(i);
 
-        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || !_splice_graph->has_node(cur_mapping.position().node_id()) || _splice_graph->get_length(_splice_graph->get_handle(cur_mapping.position().node_id())) != cur_mapping.edit(0).from_length()) {
+        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || _splice_graph->get_length(_splice_graph->get_handle(cur_mapping.position().node_id())) != cur_mapping.edit(0).from_length()) {
 
             all_junctions_added = false;
             i++;
 
-            continue;
-        }
-
-        if (i > 0) {
+
+        } else if (i > 0) {
 
             auto & prev_mapping = transcript_path.path.mapping(i - 1);
 
@@ -970,11 +968,21 @@ const MutablePathDeletableHandleGraph & Transcriptome::splice_graph() const {
 
 void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
 
+    vector<path_handle_t> path_handles;
+    path_handles.reserve(_splice_graph->get_path_count());
+
     assert(_splice_graph->for_each_path_handle([&](const path_handle_t & path_handle) {
 
+        path_handles.emplace_back(path_handle);
+    }));
+
     // Remove non transcript paths.
+    for (auto & path_handle: path_handles) {
 
         _splice_graph->destroy_path(path_handle);
-    }));
+    }
+
+    assert(_splice_graph->get_path_count() == 0);
 
     // Find all nodes that are in a transcript path.
     unordered_set<vg::id_t> transcribed_nodes;
 
     for (auto & transcript_path: _transcript_paths) {
 
         assert(transcript_path.path.mapping_size() > 0);
-        transcribed_nodes.emplace(transcript_path.path.mapping(0).position().node_id());
-
-        for (size_t i = 1; i < transcript_path.path.mapping_size(); i++) {
+        for (auto & mapping: transcript_path.path.mapping()) {
 
-            transcribed_nodes.emplace(transcript_path.path.mapping(i).position().node_id());
+            transcribed_nodes.emplace(mapping.position().node_id());
         }
     }
 
-    // Delete all nodes that are not in a transcript path.
-    vector<vg::id_t> non_transcribed_nodes;
+    vector<handle_t> non_transcribed_handles;
+    non_transcribed_handles.reserve(_splice_graph->get_node_count() - transcribed_nodes.size());
+
+    // Collect all nodes that are not in a transcript path.
     assert(_splice_graph->for_each_handle([&](const handle_t & handle) {
 
         if (transcribed_nodes.count(_splice_graph->get_id(handle)) == 0) {
 
-            _splice_graph->destroy_handle(handle);
-        }
+            non_transcribed_handles.emplace_back(handle);
+        }
     }));
+
+    for (auto & handle: non_transcribed_handles) {
+
+        // Delete node and in/out edges.
+        _splice_graph->destroy_handle(handle);
+    }
+
+    assert(_splice_graph->get_node_count() == transcribed_nodes.size());
 }
 
 void Transcriptome::compact_ordered() {
@@ -1046,16 +1062,14 @@ void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const
 
     for (auto & transcript_path: _transcript_paths) {
 
         assert(!transcript_path.haplotype_origins.empty() || !transcript_path.reference_origin.empty());
-        auto path_handle = _splice_graph->create_path_handle(transcript_path.name);
 
         if ((add_reference_paths && !transcript_path.reference_origin.empty()) || (add_non_reference_paths && !transcript_path.haplotype_origins.empty())) {
 
+            auto path_handle = _splice_graph->create_path_handle(transcript_path.name);
             for (auto & mapping: transcript_path.path.mapping()) {
 
-                auto handle = _splice_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse());
-                _splice_graph->append_step(path_handle, handle);
+                _splice_graph->append_step(path_handle, _splice_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse()));
             }
         }
     }
 
From 9ee2c4c4c858af8009bac2d87ef24643b4ae92c4 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 17:05:07 -0800
Subject: [PATCH 79/79] remove debug

---
 src/transcriptome.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index cb303be1b56..0db304b6cfc 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -14,7 +14,7 @@ namespace vg {
 
 using namespace std;
 
-#define transcriptome_debug
+// #define transcriptome_debug
 
 Transcriptome::Transcriptome(const string & graph_filename, const bool show_progress) {
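
A closing note on the destroy_handle fix in patch 78: both
remove_non_transcribed() changes apply the same collect-then-destroy pattern,
because deleting paths or nodes while for_each_path_handle() /
for_each_handle() is still running can invalidate the iteration. The following
is a minimal sketch of that pattern, not vg code; `prune_to_nodes` and `keep`
are illustrative names:

    #include <unordered_set>
    #include <vector>

    #include <handlegraph/deletable_handle_graph.hpp>

    using namespace std;
    using namespace handlegraph;

    // Remove every node whose id is not in `keep`.
    void prune_to_nodes(DeletableHandleGraph & graph, const unordered_set<nid_t> & keep) {

        vector<handle_t> to_destroy;

        // First pass: only record handles; never mutate the graph mid-iteration.
        graph.for_each_handle([&](const handle_t & handle) {

            if (keep.count(graph.get_id(handle)) == 0) {

                to_destroy.emplace_back(handle);
            }
        });

        // Second pass: deletion is now safe. destroy_handle() also removes
        // the node's incident edges.
        for (auto & handle: to_destroy) {

            graph.destroy_handle(handle);
        }
    }

The reserve() call and the trailing get_node_count() assertion in the patch
serve the same goals on the vg side: avoid reallocation during collection and
verify that exactly the non-transcribed nodes were removed.
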