From dd58c89449d03e6766f4878db3d3e4ebae3c7398 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Sat, 12 Oct 2019 14:12:42 -0700 Subject: [PATCH 01/79] Take two minimizers if only one seed --- src/minimizer_mapper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index d1b87cf549b..fbbc89437b0 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -109,7 +109,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // of the selected minimizers is not high enough. size_t hits = minimizer_index.count(minimizers[minimizer_num]); - if (seeds.size() < 2 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { + if (seeds.size() == 1 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { // Locate the hits. for (auto& hit : minimizer_index.find(minimizers[minimizer_num])) { // Reverse the hits for a reverse minimizer @@ -317,7 +317,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { } if (read_coverage_by_cluster[cluster_num] == curr_coverage && cluster_score[cluster_num] == curr_score && - curr_kept < max_extensions / 2) { + curr_kept < max_extensions * 0.75) { curr_kept++; curr_count++; } else if (!read_coverage_by_cluster[cluster_num] == curr_coverage || From 4bffbd1cf393f20ff0ca75b9a2c95e21c5b277bb Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Sat, 12 Oct 2019 17:24:20 -0700 Subject: [PATCH 02/79] Cut off at hard hit cap --- src/minimizer_mapper.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index fbbc89437b0..8cc2ac2f0ec 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -109,14 +109,19 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // of the selected minimizers is not high enough. size_t hits = minimizer_index.count(minimizers[minimizer_num]); - if (seeds.size() == 1 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { + if (seeds.size() <= 1 || hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { // Locate the hits. 
+ size_t added_hits = 0;
 for (auto& hit : minimizer_index.find(minimizers[minimizer_num])) {
 // Reverse the hits for a reverse minimizer
 if (minimizers[minimizer_num].is_reverse) {
 size_t node_length = gbwt_graph.get_length(gbwt_graph.get_handle(id(hit)));
 hit = reverse_base_pos(hit, node_length);
 }
+ if (added_hits > hard_hit_cap) {
+ //Take only up to hard_hit_cap
+ continue;
+ }
 // For each position, remember it and what minimizer it came from
 seeds.push_back(hit);
 seed_to_source.push_back(minimizer_num);

From c4b150e1fba28f941059797067761f8e752b46f4 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Thu, 17 Oct 2019 14:58:25 -0700
Subject: [PATCH 03/79] Changed default parameters

---
 scripts/giraffe-wrangler.sh | 2 +-
 src/subcommand/gaffe_main.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh
index 9bce06a9bad..5dfaedb3083 100755
--- a/scripts/giraffe-wrangler.sh
+++ b/scripts/giraffe-wrangler.sh
@@ -91,7 +91,7 @@ echo "${SIM_GAM}"
 echo "${REAL_FASTQ}"

 # Define the Giraffe parameters
-GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 300 -a 4 -s 50 -u 0.3 -v 1 -w 20)
+GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 150 -a 4 -s 50 -u 0.4 -v 1 -w 20)

 # Define a work directory
 # TODO: this requires GNU mktemp
diff --git a/src/subcommand/gaffe_main.cpp b/src/subcommand/gaffe_main.cpp
index 8269e6233a3..fd882eaedba 100644
--- a/src/subcommand/gaffe_main.cpp
+++ b/src/subcommand/gaffe_main.cpp
@@ -358,13 +358,13 @@ int main_gaffe(int argc, char** argv) {
 // How many mappings per read can we emit?
 Range<size_t> max_multimaps = 1;
 // How many clusters should we extend?
- Range<size_t> max_extensions = 300;
+ Range<size_t> max_extensions = 150;
 // How many extended clusters should we align, max?
 Range<size_t> max_alignments = 4;
 //Throw away cluster with scores that are this amount below the best
 Range<double> cluster_score = 50;
 //Throw away clusters with coverage this amount below the best
- Range<double> cluster_coverage = 0.3;
+ Range<double> cluster_coverage = 0.4;
 //Throw away extension sets with scores that are this amount below the best
 Range<double> extension_set = 20;
 //Throw away extensions with scores that are this amount below the best

From 7c42cd2bf3305abb14f4d3bfd9ef71487667d685 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Mon, 28 Oct 2019 09:25:53 -0700
Subject: [PATCH 04/79] Started paired end clusterer

---
 src/minimizer_mapper.cpp | 3 +--
 src/seed_clusterer.cpp | 34 +++++++++++++++++++++------------
 src/seed_clusterer.hpp | 41 ++++++++++++++++++++++------------
 3 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp
index 7cfe7bd0ce6..5a028a137d1 100644
--- a/src/minimizer_mapper.cpp
+++ b/src/minimizer_mapper.cpp
@@ -187,8 +187,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) {
 }

 // Cluster the seeds. Get sets of input seed indexes that go together.
- tuple<vector<vector<size_t>>, vector<vector<size_t>>> paired_clusters = clusterer.cluster_seeds(seeds, distance_limit);
- vector<vector<size_t>> clusters = std::move(std::get<0>(paired_clusters));
+ vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, distance_limit);

 if (track_provenance) {
 funnel.substage("score");
diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp
index 836288ee745..c05d75efbd7 100644
--- a/src/seed_clusterer.cpp
+++ b/src/seed_clusterer.cpp
@@ -10,8 +10,16 @@ namespace vg {
 dist_index(dist_index){
 };

- tuple<vector<vector<size_t>>, vector<vector<size_t>>> SnarlSeedClusterer::cluster_seeds (
- vector<pos_t> seeds, int64_t read_distance_limit,
+ vector<vector<size_t>> SnarlSeedClusterer::cluster_seeds (vector<pos_t> seeds, int64_t read_distance_limit) const {
+ vector<vector<pos_t>> all_seeds;
+ all_seeds.push_back(std::move(seeds));
+ tuple<vector<vector<vector<size_t>>>, vector<vector<size_t>>> all_clusters =
+ cluster_seeds(all_seeds, read_distance_limit);
+ return std::get<0>(all_clusters)[0];
+ };
+
+ tuple<vector<vector<vector<size_t>>>, vector<vector<size_t>>> SnarlSeedClusterer::cluster_seeds (
+ vector<vector<pos_t>> all_seeds, int64_t read_distance_limit,
 int64_t fragment_distance_limit) const {
 /* Given a vector of seeds and a limit, find a clustering of seeds where
 * seeds that are closer than the limit cluster together.
@@ -37,7 +45,7 @@ cerr << endl << "New cluster calculation:" << endl;
 //This stores all the tree relationships and cluster information
 //for a single level of the snarl tree as it is being processed
 //It also keeps track of the parents of the current level
- TreeState tree_state (&seeds, read_distance_limit, fragment_distance_limit);
+ TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit);

 //Populate tree_state.node_to_seeds (mapping each node to the seeds it
 //contains) and snarl_to_nodes_by_level
@@ -108,25 +116,27 @@ cerr << endl << "New cluster calculation:" << endl;
 snarl_to_nodes) const {

 // Assign each seed to a node.
- tree_state.node_to_seeds.reserve(tree_state.seeds->size());
+ tree_state.node_to_seeds.reserve(tree_state.all_seeds->size());
 for (size_t i = 0; i < tree_state.seeds->size(); i++) {
- id_t id = get_id(tree_state.seeds->at(i));
- tree_state.node_to_seeds.emplace_back(id, i);
- //For each seed, assign it to a node and the node to a snarl
+ for (size_t j = 0 ; j < tree_state.all_seeds->at(i).size() ; j++) {
+ id_t id = get_id(tree_state.all_seeds->at(i).at(j));
+ tree_state.node_to_seeds.emplace_back(id, i, j);
+ //For each seed, assign it to a node and the node to a snarl
+ }
 }
 std::sort(tree_state.node_to_seeds.begin(), tree_state.node_to_seeds.end());

 // Assign each node to a snarl.
id_t prev_node = -1;
 for (auto mapping : tree_state.node_to_seeds) {
- if (mapping.first == prev_node) {
+ if (get<0>(mapping) == prev_node) {
 continue;
 }
- prev_node = mapping.first;
- size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first);
+ prev_node = get<0>(mapping);
+ size_t snarl_i = dist_index.getPrimaryAssignment(get<0>(mapping));
 size_t depth = dist_index.snarl_indexes[snarl_i].depth;
 snarl_to_nodes[depth][snarl_i].emplace_back(
- NetgraphNode(mapping.first, NODE), NodeClusters());
+ NetgraphNode(get<0>(mapping), NODE), NodeClusters());
 }
 }
@@ -271,7 +281,7 @@ cerr << endl << "New cluster calculation:" << endl;
 auto seed_range_start = std::lower_bound(
 tree_state.node_to_seeds.begin(), tree_state.node_to_seeds.end(),
- std::pair<id_t, size_t>(node_id, 0));
+ std::tuple<id_t, size_t, size_t>(node_id, 0, 0));

 //indices of union find group ids of clusters in this node
 NodeClusters node_clusters;
@@ -287,7 +297,7 @@ cerr << endl << "New cluster calculation:" << endl;

 //And find the shortest distance from any seed to both
 //ends of the node
- pos_t seed = tree_state.seeds->at(iter->second);
+ pos_t seed = tree_state.all_seeds->at(std::get<1>(*iter)).at(std::get<2>(*iter));
 int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) : get_offset(seed) + 1;
 int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1
diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp
index e7fad28642a..03c4d20346c 100644
--- a/src/seed_clusterer.hpp
+++ b/src/seed_clusterer.hpp
@@ -18,15 +18,24 @@ class SnarlSeedClusterer {
 //cluster the seeds such that two seeds whose minimum distance
 //between them (including both of the positions) is less than
 // the distance limit are in the same cluster
- //If a fragment_distance_limit is give, then also cluster based on
- //this distance for paired-end clusters. fragment_distance_limit
- //must be greater than read_distance_limit
- //If fragment_distance_limit is 0, then ignore it
+ //
 //Returns a vector of clusters. Each cluster is a vector of
 //indices into seeds
- tuple<vector<vector<size_t>>, vector<vector<size_t>>> cluster_seeds (
- vector<pos_t> seeds,
+ vector<vector<size_t>> cluster_seeds (
+ vector<pos_t> seeds, int64_t read_distance_limit) const;
+
+ ///The same thing, but for paired end reads.
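+ //(Illustration: if two reads contribute 3 and 2 seeds, the read clusters
+ // index each read's seeds as 0..2 and 0..1 separately, while the fragment
+ // clusters index them jointly as 0..4, as described below.)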
+ //Given seeds from multiple reads of a fragment, cluster each set of seeds
+ //by the read distance and all seeds by the fragment distance limit
+ //fragment_distance_limit must be greater than read_distance_limit
+ //Returns clusters for each read and clusters of all the seeds in all reads
+ //The read clusters refer to seeds by their indexes in the input vectors of seeds
+ //The fragment clusters give seeds the index they would get if the vectors of
+ // seeds were appended to each other in the order given
+ tuple<vector<vector<vector<size_t>>>, vector<vector<size_t>>> cluster_seeds (
+ vector<vector<pos_t>> all_seeds,
 int64_t read_distance_limit, int64_t fragment_distance_limit=0) const;
+
 private:
 MinimumDistanceIndex& dist_index;
@@ -117,8 +126,11 @@ class SnarlSeedClusterer {
 //As clustering occurs at the current level, the parent level
 //is updated to know about its children

- //Vector of all the seeds
- vector<pos_t>* seeds;
+ //Vector of all the seeds for each read
+ vector<vector<pos_t>>* all_seeds;
+
+ //Vector of the offset of indices for each seed
+ vector<size_t> seed_index_offsets;

 //The minimum distance between nodes for them to be put in the
 //same cluster
@@ -129,7 +141,7 @@ class SnarlSeedClusterer {
 //////////Data structures to hold clustering information

 //Structure to hold the clustering of the seeds
- structures::UnionFind read_union_find;
+ vector<structures::UnionFind> read_union_find;
 structures::UnionFind fragment_union_find;

 //For each seed, store the distances to the left and right ends
@@ -146,7 +158,7 @@ class SnarlSeedClusterer {
 //Maps each node to a vector of the seeds that are contained in it
 //seeds are represented by indexes into the seeds vector
 //The array is sorted.
- vector<pair<id_t, size_t>> node_to_seeds;
+ vector<tuple<id_t, size_t, size_t>> node_to_seeds;

 //Map from snarl (index into dist_index.snarl_indexes) i
 //to the netgraph nodes contained in the snarl as well as the
@@ -172,14 +184,19 @@ class SnarlSeedClusterer {
 parent_snarl_to_nodes;

 //Constructor takes in a pointer to the seeds and the distance limit
- TreeState (vector<pos_t>* seeds, int64_t read_distance_limit,
+ TreeState (vector<vector<pos_t>>* all_seeds, int64_t read_distance_limit,
 int64_t fragment_distance_limit) :
- seeds(seeds),
+ all_seeds(all_seeds),
 read_cluster_dists(seeds->size(), make_pair(-1, -1)),
 read_union_find (seeds->size(), false),
 fragment_union_find (seeds->size(), false),
 read_distance_limit(read_distance_limit),
 fragment_distance_limit(fragment_distance_limit){
+ seed_index_offsets.push_back(0);
+ for (auto& v : *all_seeds) {
+ size_t offset = seed_index_offsets.back() + v.size();
+ seed_index_offsets.push_back(offset);
+ }
 }
 };

From 0b9f3dd5bb98db4959cabc770422560f5e57de1f Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 28 Oct 2019 15:51:32 -0400
Subject: [PATCH 05/79] Option for coverage threshold in augment

---
 src/augment.cpp | 158 ++++++++++++++++++++++++--------
 src/augment.hpp | 33 ++++++-
 src/packer.cpp | 18 +++-
 src/packer.hpp | 11 ++-
 src/subcommand/augment_main.cpp | 61 ++++++++----
 src/subcommand/pack_main.cpp | 9 +-
 6 files changed, 224 insertions(+), 66 deletions(-)

diff --git a/src/augment.cpp b/src/augment.cpp
index bf725f87cb9..e38b2b2556b 100644
--- a/src/augment.cpp
+++ b/src/augment.cpp
@@ -3,6 +3,7 @@

 #include "augment.hpp"
 #include "alignment.hpp"
+#include "packer.hpp"

 //#define debug

@@ -18,15 +19,21 @@ void augment(MutablePathMutableHandleGraph* graph,
 bool embed_paths,
 bool break_at_ends,
 bool remove_softclips,
- bool filter_out_of_graph_alignments) {
+ bool filter_out_of_graph_alignments,
+ Packer* packer,
+ size_t min_bp_coverage) {

- function<void(function<void(Alignment&)>, bool)> iterate_gam =
- [&gam_stream] (function<void(Alignment&)> aln_callback, bool reset_stream) {
+ function<void(function<void(Alignment&)>, bool, bool)> iterate_gam =
+ [&gam_stream] (function<void(Alignment&)> aln_callback, bool reset_stream, bool parallel) {
 if (reset_stream) {
 gam_stream.clear();
 gam_stream.seekg(0, ios_base::beg);
 }
- vg::io::for_each(gam_stream, aln_callback);
+ if (parallel) {
+ vg::io::for_each_parallel(gam_stream, aln_callback, Packer::estimate_batch_size(get_thread_count()));
+ } else {
+ vg::io::for_each(gam_stream, aln_callback);
+ }
 };

 augment_impl(graph,
@@ -36,7 +43,9 @@ void augment(MutablePathMutableHandleGraph* graph,
 embed_paths,
 break_at_ends,
 remove_softclips,
- filter_out_of_graph_alignments);
+ filter_out_of_graph_alignments,
+ packer,
+ min_bp_coverage);
 }

 void augment(MutablePathMutableHandleGraph* graph,
@@ -46,15 +55,30 @@ void augment(MutablePathMutableHandleGraph* graph,
 bool embed_paths,
 bool break_at_ends,
 bool remove_softclips,
- bool filter_out_of_graph_alignments) {
+ bool filter_out_of_graph_alignments,
+ Packer* packer,
+ size_t min_bp_coverage) {

- function<void(function<void(Alignment&)>, bool)> iterate_gam =
- [&path_vector] (function<void(Alignment&)> aln_callback, bool reset_stream) {
- for (Path& path : path_vector) {
- Alignment aln;
- *aln.mutable_path() = path;
- aln.set_name(path.name());
- aln_callback(aln);
+ function<void(function<void(Alignment&)>, bool, bool)> iterate_gam =
+ [&path_vector] (function<void(Alignment&)> aln_callback, bool reset_stream, bool parallel) {
+ if (parallel) {
+#pragma omp parallel for
+ for (size_t i = 0; i < path_vector.size(); ++i) {
+ Path& path = path_vector[i];
+ Alignment aln;
+ *aln.mutable_path() = path;
+ aln.set_name(path.name());
+ aln_callback(aln);
+ }
+
+ }
+ else {
+ for (Path& path : path_vector) {
+ Alignment aln;
+ *aln.mutable_path() = path;
+ aln.set_name(path.name());
+ aln_callback(aln);
+ }
 }
 };

 augment_impl(graph,
@@ -65,20 +89,28 @@ void augment(MutablePathMutableHandleGraph* graph,
 embed_paths,
 break_at_ends,
 remove_softclips,
- filter_out_of_graph_alignments);
+ filter_out_of_graph_alignments,
+ packer,
+ min_bp_coverage);
 }

 void augment_impl(MutablePathMutableHandleGraph* graph,
- function<void(function<void(Alignment&)>, bool)> iterate_gam,
+ function<void(function<void(Alignment&)>, bool, bool)> iterate_gam,
 vector<Translation>* out_translations,
 ostream* gam_out_stream,
 bool embed_paths,
 bool break_at_ends,
 bool remove_softclips,
- bool filter_out_of_graph_alignments) {
- // Collect the breakpoints
- unordered_map<id_t, set<pos_t>> breakpoints;
+ bool filter_out_of_graph_alignments,
+ Packer* packer,
+ size_t min_bp_coverage) {
+ // toggle between using Packer to store breakpoints or the STL map
+ bool packed_mode = min_bp_coverage > 0;
+ assert(!packed_mode || packer != nullptr);
+
+ unordered_map<id_t, set<pos_t>> breakpoints;
+
 // Check if alignment contains node that's not in the graph
 function<bool(const Path&)> check_in_graph = [&graph](const Path& path) {
 for (size_t i = 0; i < path.mapping_size(); ++i) {
@@ -106,14 +138,21 @@ void augment_impl(MutablePathMutableHandleGraph* graph,
 // Mapping (because we don't have or want a breakpoint there)
 Path simplified_path = simplify(aln.path());

- // Add in breakpoints from each path
- find_breakpoints(simplified_path, breakpoints, break_at_ends);
-
- }, false);
-
- // Invert the breakpoints that are on the reverse strand
- breakpoints = forwardize_breakpoints(graph, breakpoints);
+ if (packed_mode) {
+ find_packed_breakpoints(simplified_path, *packer, break_at_ends);
+ } else {
+ find_breakpoints(simplified_path, breakpoints, break_at_ends);
+ }
+ }, false, packed_mode);
+
+ if (packed_mode) {
+ // Filter the breakpoints by coverage
+ unordered_map<id_t, set<pos_t>> breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage);
+ } else {
+ // Invert the breakpoints that are on
the reverse strand + breakpoints = forwardize_breakpoints(graph, breakpoints); + } // get the node sizes, for use when making the translation unordered_map orig_node_sizes; @@ -187,7 +226,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, gam_buffer.push_back(aln); vg::io::write_buffered(*gam_out_stream, gam_buffer, 100); } - }, true); + }, true, false); if (gam_out_stream != nullptr) { // Flush the buffer vg::io::write_buffered(*gam_out_stream, gam_buffer, 0); @@ -235,7 +274,6 @@ void augment_impl(MutablePathMutableHandleGraph* graph, } - // returns breakpoints on the forward strand of the nodes void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends) { // We need to work out what offsets we will need to break each node at, if @@ -332,15 +370,6 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo } -path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) { - path_handle_t path_handle = graph->create_path_handle(path.name(), path.is_circular()); - for (int i = 0; i < path.mapping_size(); ++i) { - graph->append_step(path_handle, graph->get_handle(path.mapping(i).position().node_id(), - path.mapping(i).position().is_reverse())); - } - return path_handle; -} - unordered_map> forwardize_breakpoints(const HandleGraph* graph, const unordered_map>& breakpoints) { unordered_map> fwd; @@ -372,6 +401,63 @@ unordered_map> forwardize_breakpoints(const HandleGraph* graph, return fwd; } + +// returns breakpoints on the forward strand of the nodes +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends) { + // use existing methods to find the breakpoints, then copy them into a packer + // todo: streamline? + unordered_map> breakpoints; + find_breakpoints(path, breakpoints, break_ends); + breakpoints = forwardize_breakpoints(packed_breakpoints.get_graph(), breakpoints); + const HandleGraph* graph = packed_breakpoints.get_graph(); + for (auto& id_set : breakpoints) { + size_t node_len = graph->get_length(graph->get_handle(id_set.first)); + Position position; + position.set_node_id(id_set.first); + for (auto pos : id_set.second) { + size_t offset = get_offset(pos); + if (offset < node_len - 1) { + position.set_offset(offset); + packed_breakpoints.increment_coverage(packed_breakpoints.position_in_basis(position)); + } + } + } +} + +unordered_map> filter_breakpoints_by_coverage(const Packer& packed_breakpoints, size_t min_bp_coverage) { + vector>> bp_maps(get_thread_count()); + size_t n = packed_breakpoints.coverage_size(); + const VectorizableHandleGraph* vec_graph = dynamic_cast(packed_breakpoints.get_graph()); + // we assume our position vector is much larger than the number of filtered breakpoints + // and scan it in parallel in a first pass +#pragma omp parallel for + for (size_t i = 0; i < n; ++i) { + if (packed_breakpoints.coverage_at_position(i) >= min_bp_coverage) { + auto& bp_map = bp_maps[omp_get_thread_num()]; + nid_t node_id = vec_graph->node_at_vector_offset(i+1); + size_t offset = i - vec_graph->node_vector_offset(node_id); + bp_map[node_id].insert(make_pos_t(node_id, false, offset)); + } + } + // then collect up the breakpoints sequentially in a second pass + for (size_t i = 1; i < bp_maps.size(); ++i) { + for (auto& kv : bp_maps[i]) { + bp_maps[0][kv.first].insert(kv.second.begin(), kv.second.end()); + } + } + return bp_maps[0]; +} + + +path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) { + path_handle_t path_handle = 
graph->create_path_handle(path.name(), path.is_circular()); + for (int i = 0; i < path.mapping_size(); ++i) { + graph->append_step(path_handle, graph->get_handle(path.mapping(i).position().node_id(), + path.mapping(i).position().is_reverse())); + } + return path_handle; +} + map ensure_breakpoints(MutableHandleGraph* graph, const unordered_map>& breakpoints) { // Set up the map we will fill in with the new node start positions in the diff --git a/src/augment.hpp b/src/augment.hpp index ac42760ca79..3e818003f38 100644 --- a/src/augment.hpp +++ b/src/augment.hpp @@ -11,7 +11,9 @@ #include "handle.hpp" namespace vg { - + +class Packer; + using namespace std; /// %Edit the graph to include all the sequence and edges added by the given @@ -40,7 +42,9 @@ void augment(MutablePathMutableHandleGraph* graph, bool embed_paths = false, bool break_at_ends = false, bool remove_soft_clips = false, - bool filter_out_of_graph_alignments = false); + bool filter_out_of_graph_alignments = false, + Packer* packer = nullptr, + size_t min_bp_coverage = 0); /// Like above, but operates on a vector of Alignments, instead of a stream /// (Note: It is best to use stream interface for large numbers of alignments to save memory) @@ -51,23 +55,34 @@ void augment(MutablePathMutableHandleGraph* graph, bool embed_paths = false, bool break_at_ends = false, bool remove_soft_clips = false, - bool filter_out_of_graph_alignments = false); + bool filter_out_of_graph_alignments = false, + Packer* packer = nullptr, + size_t min_bp_coverage = 0); /// Generic version used to implement the above two methods. void augment_impl(MutablePathMutableHandleGraph* graph, - function, bool)> iterate_gam, + function, bool, bool)> iterate_gam, vector* out_translation, ostream* gam_out_stream, bool embed_paths, bool break_at_ends, bool remove_soft_clips, - bool filter_out_of_graph_alignments); + bool filter_out_of_graph_alignments, + Packer* packer, + size_t min_bp_coverage); /// Add a path to the graph. This is like VG::extend, and expects /// a path with no edits, and for all the nodes and edges in the path /// to exist exactly in the graph path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path); +/// Find all the points at which a Path enters or leaves nodes in the graph. Adds +/// them to the given map by node ID of sets of bases in the node that will need +/// to become the starts of new nodes. +/// +/// If break_ends is true, emits breakpoints at the ends of the path, even +/// if it starts/ends with perfect matches. + /// Find all the points at which a Path enters or leaves nodes in the graph. Adds /// them to the given map by node ID of sets of bases in the node that will need /// to become the starts of new nodes. 
@@ -80,6 +95,14 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo unordered_map> forwardize_breakpoints(const HandleGraph* graph, const unordered_map>& breakpoints); + +/// Like "find_breakpoints", but store in packed structure (better for large gams and enables coverage filter) +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true); + +/// Filters the breakpoints by coverage, and converts them back from the Packer to the STL map +/// expected by following methods +unordered_map> filter_breakpoints_by_coverage(const Packer& packed_breakpoints, size_t min_bp_coverage); + /// Take a map from node ID to a set of offsets at which new nodes should /// start (which may include 0 and 1-past-the-end, which should be ignored), /// break the specified nodes at those positions. Returns a map from old diff --git a/src/packer.cpp b/src/packer.cpp index 38100274611..ca81cfcc19e 100644 --- a/src/packer.cpp +++ b/src/packer.cpp @@ -9,6 +9,22 @@ namespace vg { const int Packer::maximum_quality = 60; const int Packer::lru_cache_size = 4096; +size_t Packer::estimate_data_width(size_t expected_coverage) { + return std::ceil(std::log2(2 * expected_coverage)); +} + +size_t Packer::estimate_batch_size(size_t num_threads) { + size_t batch_size = max((size_t)128, (size_t)(pow(2, 14 - log2(num_threads)))); + if (batch_size % 2 != 0) { + ++batch_size; + } + return batch_size; +} + +size_t Packer::estimate_bin_count(size_t num_threads) { + return pow(2, log2(num_threads) + 14); +} + Packer::Packer(void) : graph(nullptr), data_width(8), cov_bin_size(0), edge_cov_bin_size(0), num_bases_dynamic(0), base_locks(nullptr), num_edges_dynamic(0), edge_locks(nullptr), tmpfstream_locks(nullptr) { } Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins, size_t data_width, bool record_bases, bool record_edges, bool record_edits) : @@ -527,7 +543,7 @@ string Packer::unescape_delim(const string& s, char d) const { return unescaped; } -size_t Packer::coverage_size(void) { +size_t Packer::coverage_size(void) const { if (is_compacted){ return coverage_civ.size(); } diff --git a/src/packer.hpp b/src/packer.hpp index 5d7ad5e9910..dfece198320 100644 --- a/src/packer.hpp +++ b/src/packer.hpp @@ -30,6 +30,12 @@ using namespace sdsl; /// In memory, the coverages are stored in SDSL int vectors (dynamic) and on disk they are compressed int vectors class Packer { public: + + /// Some helper functions to heuristically estimate input parameters for constructor + static size_t estimate_data_width(size_t expected_coverage); + static size_t estimate_batch_size(size_t num_threads); + static size_t estimate_bin_count(size_t num_threads); + Packer(void); /// Create a Packer /// graph : Must implement the VectorizableHandleGraph interface @@ -75,7 +81,7 @@ class Packer { size_t get_n_bins(void) const; bool is_dynamic(void) const; const HandleGraph* get_graph() const; - size_t coverage_size(void); + size_t coverage_size(void) const ; void increment_coverage(size_t i); void increment_coverage(size_t i, size_t v); @@ -84,7 +90,8 @@ class Packer { size_t edge_vector_size(void) const; size_t edge_index(const Edge& e) const; void increment_edge_coverage(size_t i); - void increment_edge_coverage(size_t i, size_t v); + void increment_edge_coverage(size_t i, size_t v); + private: /// map from absolute postion to positions in the binned arrays pair coverage_bin_offset(size_t i) const; diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index 
9b1959295d3..abd0cdaa586 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -25,12 +25,14 @@ #include "../xg.hpp" #include "../vg.hpp" #include "../augment.hpp" +#include "../packer.hpp" #include #include #include #include "bdsg/packed_graph.hpp" #include "bdsg/hash_graph.hpp" #include "bdsg/odgi.hpp" +#include using namespace std; using namespace vg; @@ -47,6 +49,8 @@ void help_augment(char** argv, ConfigurableParser& parser) { << " -Z, --translation FILE save translations from augmented back to base graph to FILE" << endl << " -A, --alignment-out FILE save augmented GAM reads to FILE" << endl << " -s, --subgraph graph is a subgraph of the one used to create GAM. ignore alignments with missing nodes" << endl + << " -m, --min-coverage N minimum coverage of a breakpoint required for it to be added to the graph" << endl + << " -c, --expected-cov N expected coverage. used only for memory tuning [default : 128]" << endl << " -h, --help print this help message" << endl << " -p, --progress show progress" << endl << " -v, --verbose print information and warnings about vcf generation" << endl @@ -89,15 +93,19 @@ int main_augment(int argc, char** argv) { // fail when nodes are missing bool is_subgraph = false; + // Min coverage for graph to be broken at a breakpoint + // Whene non-zero, the Packer will be used to collect breakpoints + size_t min_coverage = 0; + + // Used to set data_width for Packer + size_t expected_coverage = 128; + // Print some progress messages to screen bool show_progress = false; // Print verbose message bool verbose = false; - // Number of threads to use (will default to all if not specified) - int thread_count = 0; - static const struct option long_options[] = { // Deprecated Options {"augmentation-mode", required_argument, 0, 'a'}, @@ -108,6 +116,8 @@ int main_augment(int argc, char** argv) { {"cut-softclips", no_argument, 0, 'C'}, {"label-paths", no_argument, 0, 'B'}, {"subgraph", no_argument, 0, 's'}, + {"min-coverage", required_argument, 0, 'm'}, + {"expected-cov", required_argument, 0, 'c'}, {"help", no_argument, 0, 'h'}, {"progress", required_argument, 0, 'p'}, {"verbose", no_argument, 0, 'v'}, @@ -117,7 +127,7 @@ int main_augment(int argc, char** argv) { {"include-gt", required_argument, 0, 'L'}, {0, 0, 0, 0} }; - static const char* short_options = "a:Z:A:iCBhpvt:l:L:s"; + static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -148,6 +158,12 @@ int main_augment(int argc, char** argv) { case 's': is_subgraph = true; break; + case 'm': + min_coverage = parse(optarg); + break; + case 'c': + expected_coverage = parse(optarg); + break; case 'h': case '?': /* getopt_long already printed an error message. */ @@ -159,12 +175,18 @@ int main_augment(int argc, char** argv) { break; case 'v': verbose = true; - break; + break; case 't': - thread_count = parse(optarg); + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; - - // Loci Options + } + // Loci Options case 'l': loci_file = optarg; break; @@ -181,12 +203,6 @@ int main_augment(int argc, char** argv) { // Parse the command line options, updating optind. 
parser.parse(argc, argv); - if (thread_count != 0) { - // Use a non-default number of threads - omp_set_num_threads(thread_count); - } - thread_count = get_thread_count(); - // Parse the two positional arguments if (optind + 1 > argc) { cerr << "[vg augment] error: too few arguments" << endl; @@ -227,6 +243,15 @@ int main_augment(int argc, char** argv) { graph = vg::io::VPKG::load_one(in); }); VG* vg_graph = dynamic_cast(graph.get()); + HandleGraph* vectorizable_graph = nullptr; + unique_ptr packer; + bdsg::VectorizableOverlayHelper overlay_helper; + if (min_coverage > 0) { + vectorizable_graph = dynamic_cast(overlay_helper.apply(graph.get())); + size_t data_width = Packer::estimate_data_width(expected_coverage); + size_t bin_count = Packer::estimate_bin_count(get_thread_count()); + packer = make_unique(vectorizable_graph, 0, bin_count, data_width, true, false, false); + } if (label_paths) { // Just add path names with extend() @@ -302,7 +327,9 @@ int main_augment(int argc, char** argv) { include_paths, include_paths, !include_softclips, - is_subgraph); + is_subgraph, + packer.get(), + min_coverage); } else { // much better to stream from a file so we can do two passes without storing in memory get_input_file(gam_in_file_name, [&](istream& alignment_stream) { @@ -313,7 +340,9 @@ int main_augment(int argc, char** argv) { include_paths, include_paths, !include_softclips, - is_subgraph); + is_subgraph, + packer.get(), + min_coverage); }); } diff --git a/src/subcommand/pack_main.cpp b/src/subcommand/pack_main.cpp index 79b8f5af8d6..fc75b273290 100644 --- a/src/subcommand/pack_main.cpp +++ b/src/subcommand/pack_main.cpp @@ -189,17 +189,14 @@ int main_pack(int argc, char** argv) { // get a data width from our expected coverage, using simple heuristic of counting // bits needed to store double the coverage - size_t data_width = std::ceil(std::log2(2 * expected_coverage)); + size_t data_width = Packer::estimate_data_width(expected_coverage); // use some naive heuristics to come up with bin count and batch size based on thread count // more bins: finer grained parallelism at cost of more mutexes and allocations // bigger batch size: more robustness to sorted input at cost of less parallelism size_t num_threads = get_thread_count(); - size_t batch_size = max((size_t)128, (size_t)(pow(2, 14 - log2(num_threads)))); - if (batch_size % 2 != 0) { - ++batch_size; - } - size_t bin_count = pow(2, log2(num_threads) + 14); + size_t batch_size = Packer::estimate_batch_size(num_threads); + size_t bin_count = Packer::estimate_bin_count(num_threads); // create our packer Packer packer(graph, bin_size, bin_count, data_width, true, true, record_edits); From 2696ec4a9cd9a3ea975b45357599ce2731fbbbd7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 29 Oct 2019 13:59:05 -0400 Subject: [PATCH 06/79] fixes and tests for augment coverages threshold --- src/augment.cpp | 140 ++++++++++++++++++++++++++++++++--------- src/augment.hpp | 6 ++ src/packer.cpp | 18 ++++-- src/packer.hpp | 3 +- test/t/17_vg_augment.t | 21 ++++++- 5 files changed, 153 insertions(+), 35 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index e38b2b2556b..468ab937d3b 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -148,12 +148,17 @@ void augment_impl(MutablePathMutableHandleGraph* graph, if (packed_mode) { // Filter the breakpoints by coverage - unordered_map> breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage); + breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage); } else { // Invert the 
breakpoints that are on the reverse strand breakpoints = forwardize_breakpoints(graph, breakpoints); } + // don't need this anymore: free up some memory + if (packer != nullptr) { + packer->clear(); + } + // get the node sizes, for use when making the translation unordered_map orig_node_sizes; orig_node_sizes.reserve(graph->get_node_count()); @@ -190,6 +195,12 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // the input paths in memory Path simplified_path = simplify(aln.path()); + // Filter out edits corresponding to breakpoints that didn't meet our coverage + // criteria + if (min_bp_coverage > 0) { + simplify_filtered_edits(graph, simplified_path, node_translation, orig_node_sizes); + } + // Now go through each new path again, by reference so we can overwrite. // Create new nodes/wire things up. Get the added version of the path. @@ -445,6 +456,7 @@ unordered_map> filter_breakpoints_by_coverage(const Packer& pac bp_maps[0][kv.first].insert(kv.second.begin(), kv.second.end()); } } + return bp_maps[0]; } @@ -551,6 +563,100 @@ map ensure_breakpoints(MutableHandleGraph* graph, return toReturn; } +// We use this function to get the id of the node that contains a position on an +// original node. +static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation) { + if(node_translation.find(make_pos_t(id(old_pos), false, 0)) == node_translation.end()) { + // The node is unchanged + return id(old_pos); + } + // Otherwise, get the first new node starting after that position, and + // then look left. + auto found = node_translation.upper_bound(old_pos); + assert(found != node_translation.end()); + if (id(found->first) != id(old_pos) + || is_rev(found->first) != is_rev(old_pos)) { + return id_t(0); + } + // Get the thing before that (last key <= the position we want + --found; + assert(graph->has_node(found->second)); + + // Return the node we found. + return found->second; +}; + + +void simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, + const unordered_map& orig_node_sizes) { + + // check if an edit position is chopped at its next or prev position + auto is_chopped = [&](pos_t edit_position, bool look_next) { + // are we adding to the offset? + bool forward = look_next != is_rev(edit_position); + bool chopped = true; + if (forward) { + // check if its chopped in the original graph + chopped = offset(edit_position) == orig_node_sizes.find(id(edit_position))->second - 1; + // check if its chopped in the translation + if (!chopped) { + auto edit_next_position = edit_position; + ++get_offset(edit_next_position); + chopped = find_new_node(graph, edit_position, node_translation) != find_new_node(graph, edit_next_position, node_translation); + } + } else { + // check if its chopped in the original graph + chopped = offset(edit_position) == 0; + // check if its chopped in the translation + if (!chopped) { + auto edit_prev_position = edit_position; + --get_offset(edit_prev_position); + chopped = find_new_node(graph, edit_position, node_translation) != find_new_node(graph, edit_prev_position, node_translation); + } + } + return chopped; + }; + + bool path_modified = false; + + for (size_t i = 0; i < path.mapping_size(); ++i) { + // For each Mapping in the path + Mapping& m = *path.mutable_mapping(i); + + // What node are we on? In old node ID space. + id_t node_id = m.position().node_id(); + + // See where the next edit starts in the node. 
It is always included + // (even when the edit runs backward), unless the edit has 0 length in + // the reference. + pos_t edit_first_position = make_pos_t(m.position()); + + for(size_t j = 0; j < m.edit_size(); ++j) { + // For each Edit in the mapping + Edit& e = *m.mutable_edit(j); + + // Work out where its end position on the original node is (inclusive) + // We don't use this on insertions, so 0-from-length edits don't matter. + pos_t edit_last_position = edit_first_position; + get_offset(edit_last_position) += (e.from_length()?e.from_length()-1:0); + + // skip edits whose breakpoitns weren't added due to the coverage filter + if (!edit_is_match(e) && (!is_chopped(edit_first_position, true) || !is_chopped(edit_last_position, false))) { + e.set_to_length(e.from_length()); + e.set_sequence(""); + path_modified = true; + } + + // Advance in the right direction along the original node for this edit. + // This way the next one will start at the right place. + get_offset(edit_first_position) += e.from_length(); + } + } + + if (path_modified) { + path = simplify(path); + } +} Path add_nodes_and_edges(MutableHandleGraph* graph, const Path& path, @@ -572,6 +678,7 @@ Path add_nodes_and_edges(MutableHandleGraph* graph, } + Path add_nodes_and_edges(MutableHandleGraph* graph, const Path& path, const map& node_translation, @@ -604,36 +711,13 @@ Path add_nodes_and_edges(MutableHandleGraph* graph, Path embedded; embedded.set_name(path.name()); - // We use this function to get the id of the node that contains a position on an - // original node. - auto find_new_node = [&](pos_t old_pos) { - if(node_translation.find(make_pos_t(id(old_pos), false, 0)) == node_translation.end()) { - // The node is unchanged - return id(old_pos); - } - // Otherwise, get the first new node starting after that position, and - // then look left. - auto found = node_translation.upper_bound(old_pos); - assert(found != node_translation.end()); - if (id(found->first) != id(old_pos) - || is_rev(found->first) != is_rev(old_pos)) { - return id_t(0); - } - // Get the thing before that (last key <= the position we want - --found; - assert(graph->has_node(found->second)); - - // Return the node we found. - return found->second; - }; - auto create_new_mappings = [&](pos_t p1, pos_t p2, bool is_rev) { vector mappings; vector nodes; for (pos_t p = p1; p <= p2; ++get_offset(p)) { - auto n = find_new_node(p); + auto n = find_new_node(graph, p, node_translation); assert(n != 0); - nodes.push_back(find_new_node(p)); + nodes.push_back(find_new_node(graph, p, node_translation)); } auto np = nodes.begin(); while (np != nodes.end()) { @@ -858,8 +942,8 @@ Path add_nodes_and_edges(MutableHandleGraph* graph, // have additional breakpoints in the middle. So we need the // left node, that contains the first base of the match, and the // right node, that contains the last base of the match. - id_t left_node = find_new_node(edit_first_position); - id_t right_node = find_new_node(edit_last_position); + id_t left_node = find_new_node(graph, edit_first_position, node_translation); + id_t right_node = find_new_node(graph, edit_last_position, node_translation); // TODO: we just assume the outer edges of these nodes are in // the right places. 
They should be if we cut the breakpoints diff --git a/src/augment.hpp b/src/augment.hpp index 3e818003f38..9fc8f5ac4f4 100644 --- a/src/augment.hpp +++ b/src/augment.hpp @@ -117,6 +117,12 @@ unordered_map> filter_breakpoints_by_coverage(const Packer& pac map ensure_breakpoints(MutableHandleGraph* graph, const unordered_map>& breakpoints); +/// Remove edits in our graph that don't correspond to breakpoints (ie were effectively filtered +/// out due to insufficient coverage. This way, subsequent logic in add_nodes_and_edges +/// can be run correctly +void simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, + const unordered_map& orig_node_sizes); + /// Given a path on nodes that may or may not exist, and a map from start /// position in the old graph to a node in the current graph, add all the /// new sequence and edges required by the path. The given path must not diff --git a/src/packer.cpp b/src/packer.cpp index ca81cfcc19e..6f14e79dd46 100644 --- a/src/packer.cpp +++ b/src/packer.cpp @@ -76,23 +76,33 @@ Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins, } } -Packer::~Packer(void) { - for (auto counter : coverage_dynamic) { +void Packer::clear() { + for (auto& counter : coverage_dynamic) { delete counter; + counter = nullptr; } - for (auto counter : edge_coverage_dynamic) { + for (auto& counter : edge_coverage_dynamic) { delete counter; + counter = nullptr; } delete [] base_locks; + base_locks = nullptr; delete [] edge_locks; + edge_locks = nullptr; delete [] tmpfstream_locks; + tmpfstream_locks = nullptr; close_edit_tmpfiles(); remove_edit_tmpfiles(); - for (auto lru_cache : quality_cache) { + for (auto& lru_cache : quality_cache) { delete lru_cache; + lru_cache = nullptr; } } +Packer::~Packer() { + clear(); +} + void Packer::load_from_file(const string& file_name) { ifstream in(file_name); if (!in) { diff --git a/src/packer.hpp b/src/packer.hpp index dfece198320..7a02bb34f9c 100644 --- a/src/packer.hpp +++ b/src/packer.hpp @@ -47,7 +47,8 @@ class Packer { /// record_edges : Store the edge coverage /// record_edits : Store the edits Packer(const HandleGraph* graph, size_t bin_size = 0, size_t coverage_bins = 1, size_t data_width = 8, bool record_bases = true, bool record_edges = true, bool record_edits = true); - ~Packer(void); + ~Packer(); + void clear(); /// Add coverage from given alignment to the indexes /// aln : given alignemnt diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index e252db9f274..0510fe2026f 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 15 +plan tests 18 vg view -J -v pileup/tiny.json > tiny.vg @@ -18,7 +18,11 @@ vg augment -a direct tiny.vg edits.gam -A edits-embedded.gam > augmented.vg is "$(vg view -aj edits-embedded.gam | jq -c '.path.mapping[].edit[].sequence' | grep null | wc -l)" "36" "direct augmentation embeds reads fully for well-supported SNPs" is "$(vg stats -N augmented.vg)" "18" "adding a well-supported SNP by direct augmentation adds 3 more nodes" -rm -f edits.gam edits-embedded.gam augmented.vg +# Run again but with packed logic. 
output should be identical with min threshold of 1 +vg augment -a direct tiny.vg edits.gam -A edits-embedded.gam -m 1 > augmented.m1.vg +is "$(vg stats -N augmented.m1.vg)" "18" "adding a well-supported SNP by direct augmentation adds 3 more nodes with -m 1" + +rm -f edits.gam edits-embedded.gam augmented.vg augmented.m1.vg # Make sure every edit is augmented in vg view -J -a -G pileup/edit.json > edit.gam @@ -74,6 +78,18 @@ vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam is $(vg augment flat.vg 2snp.gam -i | vg mod -D - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" +vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg +vg sim -l 30 -x 2err.vg -n 10 -a >2err.sim +vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam +cat 2snp.gam 2err.gam > 4edits.gam +vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes +vg augment flat.vg 2snp.gam -m 1 | vg view - | grep S | awk '{print $3}' | sort > 2snp_m1.nodes +diff 2snp_default.nodes 2snp_m1.nodes +is "$?" 0 "augmenting 2 snps with -m 1 produces the same nodes as default" +vg augment flat.vg 4edits.gam -m 11 | vg view - | grep S | awk '{print $3}' | sort > 4edits_m11.nodes +diff 2snp_default.nodes 4edits_m11.nodes +is "$?" 0 "augmenting 2 snps and 2 errors with -m 11 produces the same nodes as with just the snps" + vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > vg_augment.nodes vg convert flat.vg -p > flat.pg vg augment flat.pg 2snp.gam | vg convert -v - | vg view - | grep S | awk '{print $3}' | sort > packed_graph_augment.nodes @@ -85,3 +101,4 @@ diff vg_augment.nodes hash_graph_augment.nodes is "$?" 
0 "augmenting a hash graph produces same results as a vg graph" rm -f flat.vg flat.gcsa flat.xg flat.pg flat.hg 2snp.vg 2snp.xg 2snp.sim 2snp.gam vg_augment.nodes packed_graph_augment.nodes hash_graph_augment.nodes +rm -f 2err.sim 2err.gam 4edits.gam 2snp_default.nodes 2snp_m1.nodes 4edits_m11.nodes From c3de54c22f90e1723b7b2b2cc65a27a706494719 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 29 Oct 2019 14:39:12 -0400 Subject: [PATCH 07/79] dont process trivial paths --- src/augment.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index 468ab937d3b..b17eb3212e7 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -617,7 +617,8 @@ void simplify_filtered_edits(HandleGraph* graph, Path& path, const map Date: Tue, 29 Oct 2019 16:51:15 -0400 Subject: [PATCH 08/79] tests check that edits below coverage disappear in -A and -i output --- src/augment.cpp | 71 ++++++++++++++++++++++-------------------- src/augment.hpp | 4 +-- test/t/17_vg_augment.t | 10 ++++-- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index b17eb3212e7..771d31e4784 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -197,45 +197,49 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Filter out edits corresponding to breakpoints that didn't meet our coverage // criteria + bool has_edits = true; if (min_bp_coverage > 0) { - simplify_filtered_edits(graph, simplified_path, node_translation, orig_node_sizes); + has_edits = simplify_filtered_edits(graph, simplified_path, node_translation, orig_node_sizes); } // Now go through each new path again, by reference so we can overwrite. + // but only if we have a reason to + if (has_edits || gam_out_stream != nullptr || embed_paths) { - // Create new nodes/wire things up. Get the added version of the path. - Path added = add_nodes_and_edges(graph, simplified_path, node_translation, added_seqs, - added_nodes, orig_node_sizes); + // Create new nodes/wire things up. Get the added version of the path. + Path added = add_nodes_and_edges(graph, simplified_path, node_translation, added_seqs, + added_nodes, orig_node_sizes); - // Copy over the name - *added.mutable_name() = aln.name(); + // Copy over the name + *added.mutable_name() = aln.name(); - if (embed_paths) { - add_path_to_graph(graph, added); - } + if (embed_paths) { + add_path_to_graph(graph, added); + } - // something is off about this check. - // assuming the GAM path is sorted, let's double-check that its edges are here - for (size_t i = 1; i < added.mapping_size(); ++i) { - auto& m1 = added.mapping(i-1); - auto& m2 = added.mapping(i); - // we're no longer sorting our input paths, so we assume they are sorted - assert((m1.rank() == 0 && m2.rank() == 0) || (m1.rank() + 1 == m2.rank())); - //if (!adjacent_mappings(m1, m2)) continue; // the path is completely represented here - auto s1 = graph->get_handle(m1.position().node_id(), m1.position().is_reverse()); - auto s2 = graph->get_handle(m2.position().node_id(), m2.position().is_reverse()); - // check that we always have an edge between the two nodes in the correct direction - if (!graph->has_edge(s1, s2)) { - // force these edges in - graph->create_edge(s1, s2); + // something is off about this check. 
+ // assuming the GAM path is sorted, let's double-check that its edges are here + for (size_t i = 1; i < added.mapping_size(); ++i) { + auto& m1 = added.mapping(i-1); + auto& m2 = added.mapping(i); + // we're no longer sorting our input paths, so we assume they are sorted + assert((m1.rank() == 0 && m2.rank() == 0) || (m1.rank() + 1 == m2.rank())); + //if (!adjacent_mappings(m1, m2)) continue; // the path is completely represented here + auto s1 = graph->get_handle(m1.position().node_id(), m1.position().is_reverse()); + auto s2 = graph->get_handle(m2.position().node_id(), m2.position().is_reverse()); + // check that we always have an edge between the two nodes in the correct direction + if (!graph->has_edge(s1, s2)) { + // force these edges in + graph->create_edge(s1, s2); + } } - } - // optionally write out the modified path to GAM - if (gam_out_stream != nullptr) { - *aln.mutable_path() = added; - gam_buffer.push_back(aln); - vg::io::write_buffered(*gam_out_stream, gam_buffer, 100); + // optionally write out the modified path to GAM + if (gam_out_stream != nullptr) { + *aln.mutable_path() = added; + gam_buffer.push_back(aln); + vg::io::write_buffered(*gam_out_stream, gam_buffer, 100); + } } }, true, false); if (gam_out_stream != nullptr) { @@ -587,7 +591,7 @@ static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation, +bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, const unordered_map& orig_node_sizes) { // check if an edit position is chopped at its next or prev position @@ -658,13 +662,12 @@ void simplify_filtered_edits(HandleGraph* graph, Path& path, const map ensure_breakpoints(MutableHandleGraph* graph, /// Remove edits in our graph that don't correspond to breakpoints (ie were effectively filtered /// out due to insufficient coverage. This way, subsequent logic in add_nodes_and_edges -/// can be run correctly -void simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, +/// can be run correctly. Returns true if at least one edit survived the filter. +bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, const unordered_map& orig_node_sizes); /// Given a path on nodes that may or may not exist, and a map from start diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index 0510fe2026f..109927fe822 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 18 +plan tests 21 vg view -J -v pileup/tiny.json > tiny.vg @@ -43,6 +43,12 @@ vg index -k 11 -g t.idx.gcsa -x t.idx.xg t.vg is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i | vg view - | grep ^S | wc -l) 1 "path inclusion does not modify the graph when alignment is a perfect match" +is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGGAGTTCTAATATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i -m 2 | vg view - | grep ^S | wc -l) 1 "path inclusion does not modify the graph when alignment has a SNP but doesnt meet the coverage threshold" + +is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGGAGTTCTAATATATTCCAACTCTCTG -V read -d t.idx | vg augment t.vg - -i -m 2 -A read_aug.gam | vg view - | grep ^P | awk '{print $4}' | uniq) "50M" "path inclusion does not modify the included path when alignment has a SNP but doesnt meet the coverage threshold" + +is $(vg view -a read_aug.gam | jq. 
| grep edit | wc) 1 "output GAM has single edit when SNP was filtered out due to coverage" + is $(vg map -s CAAATAAGGCTTGGAAAGGGTTTCTGGAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i | vg view - | grep ^S | wc -l) 5 "path inclusion with a complex variant introduces the right number of nodes" # checks that we get a node with the id 4, which is the ref-matching dual to the deletion @@ -51,7 +57,7 @@ is $(vg map -s CAAAAAGGCTTGGAAAGGGTTTCTGGAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg is $(vg map -s CAAATAAGGCTTGGAAATTTTCTGCAGTTCTATTATATTCCAACTCTCTG -d t.idx | vg augment t.vg - -i | vg view - | grep ^S | wc -l) 4 "SNPs can be included in the graph" rm t.vg -rm -rf t.idx.xg t.idx.gcsa +rm -rf t.idx.xg t.idx.gcsa read_aug.gam vg construct -v tiny/tiny.vcf.gz -r tiny/tiny.fa >t.vg vg align -s GGGGGGGAAATTTTCTGGAGTTCTATTATATTCCAAAAAAAAAA t.vg >t.gam From 66d4f4895d45c16ffe355de55fc026402d706f20 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 30 Oct 2019 14:00:05 -0400 Subject: [PATCH 09/79] quality filters for augment --- src/augment.cpp | 106 ++++++++++++++++++++++---------- src/augment.hpp | 22 ++++++- src/subcommand/augment_main.cpp | 32 ++++++++-- test/t/17_vg_augment.t | 20 +++++- 4 files changed, 138 insertions(+), 42 deletions(-) diff --git a/src/augment.cpp b/src/augment.cpp index 771d31e4784..dda25626556 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -20,6 +20,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_softclips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage) { @@ -44,6 +46,8 @@ void augment(MutablePathMutableHandleGraph* graph, break_at_ends, remove_softclips, filter_out_of_graph_alignments, + min_baseq, + min_mapq, packer, min_bp_coverage); } @@ -56,6 +60,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_softclips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage) { @@ -90,6 +96,8 @@ void augment(MutablePathMutableHandleGraph* graph, break_at_ends, remove_softclips, filter_out_of_graph_alignments, + min_baseq, + min_mapq, packer, min_bp_coverage); } @@ -102,11 +110,13 @@ void augment_impl(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_softclips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage) { // toggle between using Packer to store breakpoints or the STL map - bool packed_mode = min_bp_coverage > 0; + bool packed_mode = min_bp_coverage > 0 || min_baseq > 0; assert(!packed_mode || packer != nullptr); unordered_map> breakpoints; @@ -126,7 +136,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, #ifdef debug cerr << pb2json(aln.path()) << endl; #endif - if (filter_out_of_graph_alignments && !check_in_graph(aln.path())) { + if (aln.mapping_quality() < min_mapq || (filter_out_of_graph_alignments && !check_in_graph(aln.path()))) { return; } @@ -140,9 +150,11 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Add in breakpoints from each path if (packed_mode) { - find_packed_breakpoints(simplified_path, *packer, break_at_ends); + find_packed_breakpoints(simplified_path, *packer, break_at_ends, aln.quality(), min_baseq); } else { - find_breakpoints(simplified_path, breakpoints, break_at_ends); + // note: we cannot pass non-zero min_baseq here. 
it relies on filter_breakpoints_by_coverage + // to work correctly, and must be passed in only via find_packed_breakpoints. + find_breakpoints(simplified_path, breakpoints, break_at_ends, "", 0); } }, false, packed_mode); @@ -181,7 +193,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Second pass: add the nodes and edges iterate_gam((function)[&](Alignment& aln) { - if (filter_out_of_graph_alignments && !check_in_graph(aln.path())) { + if (aln.mapping_quality() < min_mapq || (filter_out_of_graph_alignments && !check_in_graph(aln.path()))) { return; } @@ -289,8 +301,21 @@ void augment_impl(MutablePathMutableHandleGraph* graph, } +double get_avg_baseq(const Edit& edit, const string& base_quals, size_t position_in_read) { + double avg_qual = numeric_limits::max(); + if (!base_quals.empty() && !edit.sequence().empty() && (edit_is_sub(edit) || edit_is_insertion(edit))) { + double tot_qual = 0; + for (int i = 0; i < edit.sequence().length(); ++i) { + tot_qual += base_quals[position_in_read + i]; + } + avg_qual = tot_qual / (double)edit.sequence().length(); + } + return avg_qual; +} + // returns breakpoints on the forward strand of the nodes -void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends) { +void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends, + const string& base_quals, double min_baseq) { // We need to work out what offsets we will need to break each node at, if // we want to add in all the new material and edges in this path. @@ -298,6 +323,9 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo cerr << "Processing path..." << endl; #endif + // The base position in the edit + size_t position_in_read = 0; + for (size_t i = 0; i < path.mapping_size(); ++i) { // For each Mapping in the path const Mapping& m = path.mapping(i); @@ -338,38 +366,41 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo cerr << pb2json(e) << endl; #endif - if (!edit_is_match(e) || (j == 0 && (i != 0 || break_ends))) { - // If this edit is not a perfect match, or if this is the first - // edit in this mapping and either we had a previous mapping we - // may need to connect to or we want to break at the path's - // start, we need to make sure we have a breakpoint at the start - // of this edit. + // Do the base quality check if applicable. If it fails we just ignore the edit + if (min_baseq == 0 || get_avg_baseq(e, base_quals, position_in_read) >= min_baseq) { + + if (!edit_is_match(e) || (j == 0 && (i != 0 || break_ends))) { + // If this edit is not a perfect match, or if this is the first + // edit in this mapping and either we had a previous mapping we + // may need to connect to or we want to break at the path's + // start, we need to make sure we have a breakpoint at the start + // of this edit. #ifdef debug - cerr << "Need to break " << node_id << " at edit lower end " << - edit_first_position << endl; + cerr << "Need to break " << node_id << " at edit lower end " << + edit_first_position << endl; #endif - // We need to snip between edit_first_position and edit_first_position - direction. - // Note that it doesn't matter if we put breakpoints at 0 and 1-past-the-end; those will be ignored. - breakpoints[node_id].insert(edit_first_position); - } + // We need to snip between edit_first_position and edit_first_position - direction. + // Note that it doesn't matter if we put breakpoints at 0 and 1-past-the-end; those will be ignored. 
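+ // (Worked example of the quality gate above: with min_baseq 10, a 2 bp
+ // substitution whose base qualities are 5 and 9 averages 7, fails the
+ // check, and records no breakpoints at all.)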
+ breakpoints[node_id].insert(edit_first_position); + } - if (!edit_is_match(e) || (j == m.edit_size() - 1 && (i != path.mapping_size() - 1 || break_ends))) { - // If this edit is not a perfect match, or if it is the last - // edit in a mapping and we have a subsequent mapping we might - // need to connect to or we want to break at the path ends, make - // sure we have a breakpoint at the end of this edit. + if (!edit_is_match(e) || (j == m.edit_size() - 1 && (i != path.mapping_size() - 1 || break_ends))) { + // If this edit is not a perfect match, or if it is the last + // edit in a mapping and we have a subsequent mapping we might + // need to connect to or we want to break at the path ends, make + // sure we have a breakpoint at the end of this edit. #ifdef debug - cerr << "Need to break " << node_id << " at past edit upper end " << - edit_last_position << endl; + cerr << "Need to break " << node_id << " at past edit upper end " << + edit_last_position << endl; #endif - // We also need to snip between edit_last_position and edit_last_position + direction. - breakpoints[node_id].insert(edit_last_position); + // We also need to snip between edit_last_position and edit_last_position + direction. + breakpoints[node_id].insert(edit_last_position); + } } - // TODO: for an insertion or substitution, note that we need a new // node and two new edges. @@ -380,6 +411,8 @@ void find_breakpoints(const Path& path, unordered_map>& breakpo // Use up the portion of the node taken by this mapping, so we know // where the next mapping will start. edit_first_position = edit_last_position; + + position_in_read += e.to_length(); } } @@ -418,11 +451,12 @@ unordered_map> forwardize_breakpoints(const HandleGraph* graph, // returns breakpoints on the forward strand of the nodes -void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends) { +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends, + const string& base_quals, double min_baseq) { // use existing methods to find the breakpoints, then copy them into a packer // todo: streamline? 
unordered_map> breakpoints; - find_breakpoints(path, breakpoints, break_ends); + find_breakpoints(path, breakpoints, break_ends, base_quals, min_baseq); breakpoints = forwardize_breakpoints(packed_breakpoints.get_graph(), breakpoints); const HandleGraph* graph = packed_breakpoints.get_graph(); for (auto& id_set : breakpoints) { @@ -592,7 +626,8 @@ static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation, - const unordered_map& orig_node_sizes) { + const unordered_map& orig_node_sizes, + const string& base_quals, double min_baseq) { // check if an edit position is chopped at its next or prev position auto is_chopped = [&](pos_t edit_position, bool look_next) { @@ -624,6 +659,9 @@ bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map 0 && get_avg_baseq(e, base_quals, position_in_read) < min_baseq)) { e.set_to_length(e.from_length()); e.set_sequence(""); filtered_an_edit = true; @@ -659,6 +699,8 @@ bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map* out_translation = nullptr, @@ -43,6 +45,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends = false, bool remove_soft_clips = false, bool filter_out_of_graph_alignments = false, + double min_baseq = 0, + double min_mapq = 0, Packer* packer = nullptr, size_t min_bp_coverage = 0); @@ -56,6 +60,8 @@ void augment(MutablePathMutableHandleGraph* graph, bool break_at_ends = false, bool remove_soft_clips = false, bool filter_out_of_graph_alignments = false, + double min_baseq = 0, + double min_mapq = 0, Packer* packer = nullptr, size_t min_bp_coverage = 0); @@ -68,6 +74,8 @@ void augment_impl(MutablePathMutableHandleGraph* graph, bool break_at_ends, bool remove_soft_clips, bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, Packer* packer, size_t min_bp_coverage); @@ -76,6 +84,11 @@ void augment_impl(MutablePathMutableHandleGraph* graph, /// to exist exactly in the graph path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path); +/// Compute the average base quality of an edit. +/// If the edit has no sequence or there are no base_quals given, +/// then double_max is returned. +double get_avg_baseq(const Edit& edit, const string& base_quals, size_t position_in_read); + /// Find all the points at which a Path enters or leaves nodes in the graph. Adds /// them to the given map by node ID of sets of bases in the node that will need /// to become the starts of new nodes. @@ -89,7 +102,8 @@ path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) /// /// If break_ends is true, emits breakpoints at the ends of the path, even /// if it starts/ends with perfect matches. -void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends = true); +void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends = true, + const string& base_quals = "", double min_baseq = 0); /// Flips the breakpoints onto the forward strand. 
unordered_map> forwardize_breakpoints(const HandleGraph* graph, @@ -97,7 +111,8 @@ unordered_map> forwardize_breakpoints(const HandleGraph* graph, /// Like "find_breakpoints", but store in packed structure (better for large gams and enables coverage filter) -void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true); +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true, + const string& base_quals = "", double min_baseq = 0); /// Filters the breakpoints by coverage, and converts them back from the Packer to the STL map /// expected by following methods @@ -121,7 +136,8 @@ map ensure_breakpoints(MutableHandleGraph* graph, /// out due to insufficient coverage. This way, subsequent logic in add_nodes_and_edges /// can be run correctly. Returns true if at least one edit survived the filter. bool simplify_filtered_edits(HandleGraph* graph, Path& path, const map& node_translation, - const unordered_map& orig_node_sizes); + const unordered_map& orig_node_sizes, + const string& base_quals = "", double min_baseq = 0); /// Given a path on nodes that may or may not exist, and a map from start /// position in the old graph to a node in the current graph, add all the diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index abd0cdaa586..994a65c4050 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -51,10 +51,12 @@ void help_augment(char** argv, ConfigurableParser& parser) { << " -s, --subgraph graph is a subgraph of the one used to create GAM. ignore alignments with missing nodes" << endl << " -m, --min-coverage N minimum coverage of a breakpoint required for it to be added to the graph" << endl << " -c, --expected-cov N expected coverage. 
used only for memory tuning [default : 128]" << endl + << " -q, --min-baseq N ignore edits whose sequence have average base quality < N" << endl + << " -Q, --min-mapq N ignore alignments with mapping quality < N" << endl << " -h, --help print this help message" << endl << " -p, --progress show progress" << endl << " -v, --verbose print information and warnings about vcf generation" << endl - << " -t, --threads N number of threads to use" << endl + << " -t, --threads N number of threads (only 1st pass with -m or -q option is multithreaded)" << endl << "loci file options:" << endl << " -l, --include-loci FILE merge all alleles in loci into the graph" << endl << " -L, --include-gt FILE merge only the alleles in called genotypes into the graph" << endl; @@ -100,6 +102,12 @@ int main_augment(int argc, char** argv) { // Used to set data_width for Packer size_t expected_coverage = 128; + // Minimum average base quality in an edit's sequence for it to be used + double min_baseq = 0; + + // Minimum mapping quality of an alignment for it to be used + double min_mapq = 0; + // Print some progress messages to screen bool show_progress = false; @@ -117,7 +125,9 @@ int main_augment(int argc, char** argv) { {"label-paths", no_argument, 0, 'B'}, {"subgraph", no_argument, 0, 's'}, {"min-coverage", required_argument, 0, 'm'}, - {"expected-cov", required_argument, 0, 'c'}, + {"expected-cov", required_argument, 0, 'c'}, + {"min-baseq", required_argument, 0, 'q'}, + {"min-mapq", required_argument, 0, 'Q'}, {"help", no_argument, 0, 'h'}, {"progress", required_argument, 0, 'p'}, {"verbose", no_argument, 0, 'v'}, @@ -127,7 +137,7 @@ int main_augment(int argc, char** argv) { {"include-gt", required_argument, 0, 'L'}, {0, 0, 0, 0} }; - static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:"; + static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:q:Q:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -163,7 +173,13 @@ int main_augment(int argc, char** argv) { break; case 'c': expected_coverage = parse(optarg); - break; + break; + case 'q': + min_baseq = parse(optarg); + break; + case 'Q': + min_mapq = parse(optarg); + break; case 'h': case '?': /* getopt_long already printed an error message. */ @@ -246,7 +262,9 @@ int main_augment(int argc, char** argv) { HandleGraph* vectorizable_graph = nullptr; unique_ptr packer; bdsg::VectorizableOverlayHelper overlay_helper; - if (min_coverage > 0) { + // the packer's required for any kind of filtering logic -- so we use it when + // baseq is present as well. 
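+    // (min_baseq is a double, so any value greater than 0 makes this condition true)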
+ if (min_coverage > 0 || min_baseq ) { vectorizable_graph = dynamic_cast(overlay_helper.apply(graph.get())); size_t data_width = Packer::estimate_data_width(expected_coverage); size_t bin_count = Packer::estimate_bin_count(get_thread_count()); @@ -328,6 +346,8 @@ int main_augment(int argc, char** argv) { include_paths, !include_softclips, is_subgraph, + min_baseq, + min_mapq, packer.get(), min_coverage); } else { @@ -341,6 +361,8 @@ int main_augment(int argc, char** argv) { include_paths, !include_softclips, is_subgraph, + min_baseq, + min_mapq, packer.get(), min_coverage); }); diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index 109927fe822..ed66c2dc910 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 21 +plan tests 23 vg view -J -v pileup/tiny.json > tiny.vg @@ -96,6 +96,22 @@ vg augment flat.vg 4edits.gam -m 11 | vg view - | grep S | awk '{print $3}' | so diff 2snp_default.nodes 4edits_m11.nodes is "$?" 0 "augmenting 2 snps and 2 errors with -m 11 produces the same nodes as with just the snps" +# 2 snps, but one has a low quality, and one has a high quality +echo "@read" > qual.fq +echo "CAAATAAGGCTTGGAAATTGTCTGGAGTTCTATTATATGCCAACTCTCTG" >> qual.fq +echo "+" >> qual.fq +echo "BBBBBBBBBBBBBBBBBBB+BBBBBBBBBBBBBBBBBBKBBBBBBBBBBB" >> qual.fq +# reverse complement +echo "@daer" >> qual.fq +echo "CAGAGAGTTGGCATATAATAGAACTCCAGACAATTTCCAAGCCTTATTTG" >> qual.fq +echo "+" >> qual.fq +echo "BBBBBBBBBBBKBBBBBBBBBBBBBBBBBB+BBBBBBBBBBBBBBBBBBB" >> qual.fq +vg map -g flat.gcsa -x flat.xg -f qual.fq -k 8 > 2qual.gam +# sanity check: +is $(vg augment flat.vg 2qual.gam -m 2 | vg view - | grep ^S | wc -l) 7 "augmenting with 2snps makes correct number of nodes" +# test quality filter +is $(vg augment flat.vg 2qual.gam -m 2 -q 30 | vg view - | grep ^S | wc -l) 4 "low-quality snp is filtered" + vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > vg_augment.nodes vg convert flat.vg -p > flat.pg vg augment flat.pg 2snp.gam | vg convert -v - | vg view - | grep S | awk '{print $3}' | sort > packed_graph_augment.nodes @@ -107,4 +123,4 @@ diff vg_augment.nodes hash_graph_augment.nodes is "$?" 
0 "augmenting a hash graph produces same results as a vg graph" rm -f flat.vg flat.gcsa flat.xg flat.pg flat.hg 2snp.vg 2snp.xg 2snp.sim 2snp.gam vg_augment.nodes packed_graph_augment.nodes hash_graph_augment.nodes -rm -f 2err.sim 2err.gam 4edits.gam 2snp_default.nodes 2snp_m1.nodes 4edits_m11.nodes +rm -f 2err.sim 2err.gam 4edits.gam 2snp_default.nodes 2snp_m1.nodes 4edits_m11.nodes 2qual.gam qual.fq From 9c0bf18e7e4fedffa322e51e3c2c152cde0a9e4a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 30 Oct 2019 16:39:28 -0700 Subject: [PATCH 10/79] Let distance indexing take any handle graph, and document that -x can feed it an XG --- src/subcommand/index_main.cpp | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 054da2bb431..682c11f380a 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -41,7 +41,7 @@ void help_index(char** argv) { << " -t, --threads N number of threads to use" << endl << " -p, --progress show progress" << endl << "xg options:" << endl - << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA indexing" << endl + << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl << " -L, --xg-alts include alt paths in xg" << endl << "gbwt options:" << endl << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl @@ -1234,6 +1234,8 @@ int main_index(int argc, char** argv) { if (file_names.empty() && xg_name.empty()) { cerr << "error: [vg index] one graph is required to build a distance index" << endl; return 1; + } else if (file_names.size() > 1 || (file_names.size() == 1 && !xg_name.empty())) { + cerr << "error: [vg index] only one graph at a time can be used to build a distance index" << endl; } else if (dist_name.empty()) { cerr << "error: [vg index] distance index requires an output file" << endl; return 1; @@ -1253,31 +1255,25 @@ int main_index(int argc, char** argv) { //Get graph and build dist index if (file_names.empty() && !xg_name.empty()) { + // We were given a -x specifically to read as XG ifstream xg_stream(xg_name); auto xg = vg::io::VPKG::load_one(xg_stream); // Create the MinimumDistanceIndex - MinimumDistanceIndex di (xg.get(), snarl_manager); + MinimumDistanceIndex di(xg.get(), snarl_manager); // Save the completed DistanceIndex - ofstream ostream (dist_name); + ofstream ostream(dist_name); di.serialize(ostream); } else { - ifstream vg_stream(file_names.at(0)); - - if (!vg_stream) { - cerr << "error: [vg index] cannot open VG file" << endl; - exit(1); - } - - VG vg(vg_stream); - vg_stream.close(); + // We were given a graph generically + auto graph = vg::io::VPKG::load_one(file_names.at(0)); // Create the MinimumDistanceIndex - MinimumDistanceIndex di (&vg, snarl_manager); + MinimumDistanceIndex di(graph.get(), snarl_manager); // Save the completed DistanceIndex - ofstream ostream (dist_name); + ofstream ostream(dist_name); di.serialize(ostream); // vg::io::VPKG::save(di, dist_name); } From 2c060bce77ea30a0ffcc1e50b11bef3fa84442b9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 31 Oct 2019 09:57:32 -0400 Subject: [PATCH 11/79] update libvgio --- deps/libvgio | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libvgio b/deps/libvgio index 1248a9b6444..fe25b2be07c 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ 
-Subproject commit 1248a9b64440b91ba8431cad8f3aa72f734b5bef +Subproject commit fe25b2be07cbbcf98ed2380213b1fa064e04f68d From eb66165e0dc10fdf20eb01ef592c80b5525b1362 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 31 Oct 2019 16:42:46 -0400 Subject: [PATCH 12/79] start vg depth --- src/algorithms/coverage_depth.cpp | 207 ++++++++++++++++++++++++++++++ src/algorithms/coverage_depth.hpp | 48 +++++++ src/subcommand/depth_main.cpp | 200 +++++++++++++++++++++++++++++ 3 files changed, 455 insertions(+) create mode 100644 src/algorithms/coverage_depth.cpp create mode 100644 src/algorithms/coverage_depth.hpp create mode 100644 src/subcommand/depth_main.cpp diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp new file mode 100644 index 00000000000..e848161124f --- /dev/null +++ b/src/algorithms/coverage_depth.cpp @@ -0,0 +1,207 @@ +#include "coverage_depth.hpp" +#include +#include "algorithms/subgraph.hpp" +#include +#include "../path.hpp" + +namespace vg { +namespace algorithms { + +/// Estimate the depth of coverage of a given (sub) graph using the packer +/// Coverage is computed relative to the given path +double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path) { + + // get the path length + path_handle_t path_handle = graph.get_path_handle(ref_path); + size_t path_len = 0; + for (handle_t handle : graph.scan_path(path_handle)) { + path_len += graph.get_length(handle); + } + if (path_len == 0) { + return 0; + } + + // sum up the coverage + size_t tot_base_coverage = 0; + graph.for_each_handle([&] (handle_t handle) { + Position pos; + pos.set_node_id(graph.get_id(handle)); + size_t packer_pos = packer.position_in_basis(pos); + size_t node_len = graph.get_length(handle); + for (size_t offset = 0; offset < node_len; ++offset) { + tot_base_coverage += packer.coverage_at_position(packer_pos + offset); + } + }); + + // return average (over the path) + return (double)tot_base_coverage / (double)path_len; +} + + +/// Estimate the binned coverage along a path +map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, + size_t step, size_t context, size_t threads) { + + // move forward along path (note: this can be sped up if we're given a PathPositionHandleGraph but I don't think + // it matters for a couple of scans. 
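+    // advance() walks forward one step at a time until at least "distance" bases have been
+    // consumed (possibly overshooting by up to a node length) and returns the distance covered.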
+ function advance = [&] (step_handle_t& step_handle, size_t distance) { + size_t went = 0; + for (; graph.has_next_step(step_handle) && went < distance; step_handle = graph.get_next_step(step_handle)) { + went += graph.get_length(graph.get_handle_of_step(step_handle)); + } + return went; + }; + + path_handle_t path_handle = graph.get_path_handle(ref_path); + step_handle_t step_handle = graph.path_begin(path_handle); + + // hop along the graph, grabbing a step handle every "step" bases (or thereabouts) + vector> bin_centers; + size_t pos = advance(step_handle, step / 2); + if (pos >= step / 2) { + size_t went; + do { + if (bin_centers.empty() || step_handle != bin_centers.back().second) { + bin_centers.push_back(make_pair(pos, step_handle)); + } + went = advance(step_handle, step); + pos += went; + } while (went >= step); + } + + // our graph's too small to do any stepping, just use the first handle + if (bin_centers.empty()) { + bin_centers.push_back(make_pair(0, graph.path_begin(path_handle))); + } + + // visit every bin center and make a subgraph to collect coverage from + if (threads == 0) { + threads = get_thread_count(); + } + map binned_depths; +#pragma omp parallel for num_threads(threads) + for (size_t i = 0; i < bin_centers.size(); ++i) { + // extract the subgraph + bdsg::HashGraph subgraph; + step_handle_t bin_step = bin_centers[i].second; + handle_t bin_handle = graph.get_handle_of_step(bin_step); + assert(graph.get_is_reverse(bin_handle) == false); + subgraph.create_handle(graph.get_sequence(bin_handle), graph.get_id(bin_handle)); + expand_subgraph_by_steps(graph, subgraph, context); + + // sum up the coverage on the subgraph + size_t tot_base_coverage = 0; + size_t tot_ref_len = 0; + subgraph.for_each_handle([&] (handle_t sub_handle) { + // go back into the original graph because we don't have any + // path information in the subgraph because we are unable + // to get it without requiring the path position interface + handle_t orig_handle = graph.get_handle(subgraph.get_id(sub_handle)); + Position pos; + pos.set_node_id(graph.get_id(orig_handle)); + size_t packer_pos = packer.position_in_basis(pos); + size_t node_len = graph.get_length(orig_handle); + for (size_t offset = 0; offset < node_len; ++offset) { + tot_base_coverage += packer.coverage_at_position(packer_pos + offset); + } + // we manually test if each handle is on our reference path (again, to + // not require path position interface) + vector step_path_handles = graph.steps_of_handle(orig_handle); + bool on_ref = false; + for (size_t j = 0; j < step_path_handles.size() && !on_ref; ++j) { + on_ref = graph.get_path_handle_of_step(step_path_handles[j]) == path_handle; + } + if (on_ref) { + tot_ref_len += node_len; + } + }); + + assert(tot_ref_len > 0); + double avg_base_coverage = tot_base_coverage / tot_ref_len; + +#pragma omp critical (update_binned_depth) + binned_depths[bin_centers[i].first] = avg_base_coverage; + } + + return binned_depths; + +} + +// draw (roughly) max_nodes nodes from the graph using the random seed +static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { + default_random_engine generator(random_seed); + uniform_real_distribution distribution(0, 1); + double cutoff = std::min((double)1.0, (double)(max_nodes / graph.get_node_count())); + unordered_map sampled_nodes; + graph.for_each_handle([&](handle_t handle) { + if (cutoff == 1 || cutoff < distribution(generator)) { + sampled_nodes[graph.get_id(handle)] = 0; + } + }); + return sampled_nodes; +} + +// 
update the coverage from an alignment. only count nodes that are in the map already +static void update_sample_gam_depth(const Alignment& aln, unordered_map& node_coverage) { + const Path& path = aln.path(); + for (int i = 0; i < path.mapping_size(); ++i) { + const Mapping& mapping = path.mapping(i); + nid_t node_id = mapping.position().node_id(); + if (node_coverage.count(node_id)) { + ++node_coverage[node_id]; + } + } +} + +// sum up the results from the different threads and return the average. +// if a min_coverage is given, nodes with less coverage are ignored +static double combine_and_average_node_coverages(vector>& node_coverages, size_t min_coverage) { + for (int i = 1; i < node_coverages.size(); ++i) { + for (const auto& node_cov : node_coverages[i]) { + node_coverages[0][node_cov.first] += node_cov.second; + } + } + size_t tot_coverage = 0; + size_t tot_count = 0; + for (const auto & node_cov : node_coverages[0]) { + if (node_cov.second >= min_coverage) { + tot_coverage += node_cov.second; + ++tot_count; + } + } + + return tot_count > 0 ? (double)tot_coverage / (double)tot_count : 0; +} + + +double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage) { + // one node counter per thread + vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); + + function aln_callback = [&](Alignment& aln) { + update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + }; + vg::io::for_each_parallel(gam_stream, aln_callback); + return combine_and_average_node_coverages(node_coverages, min_coverage); +} + +double sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage) { + // one node counter per thread + vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); + +#pragma omp parallel for + for (size_t i = 0; i < alignments.size(); ++i) { + update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + } + return combine_and_average_node_coverages(node_coverages, min_coverage); +} + + + +} + + + + +} + diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp new file mode 100644 index 00000000000..b7ffb645a4a --- /dev/null +++ b/src/algorithms/coverage_depth.hpp @@ -0,0 +1,48 @@ +#ifndef VG_DEPTH_HPP_INCLUDED +#define VG_DEPTH_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "packer.hpp" + + +namespace vg { +namespace algorithms { + +using namespace std; + +/// Estimate the depth of coverage of a given (sub) graph using the packer +/// Coverage is computed relative to the given path +double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path); + +/// Estimate the binned coverage along a path using the packer +/// ref_path is scanned, and every "step" bases as subgraph is extracted using the given number of context steps +/// If threads is 0, all the threads are used +map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, + size_t step, size_t context, size_t threads = 0); + + +/// Get the depth of a bin +/// the "k_nearest" closest bins to the given position are used +/// bins with coverage below min_coverage are ignored +double get_binned_depth(const unordered_map& binned_depths, size_t pos, size_t k_nearest = 3, double min_coverage = 1.0); + +/// Return the average depth of coverage of randomly 
sampled nodes from a GAM +/// Nodes with less than min_coverage are ignored +/// The stream is scanned in parallel with all threads +/// max_nodes is used to keep memory down +double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0); + +/// As above, but read a vector instead of a stream +double sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0); + +} +} + +#endif diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp new file mode 100644 index 00000000000..7a892397a3e --- /dev/null +++ b/src/subcommand/depth_main.cpp @@ -0,0 +1,200 @@ +/** \file depth_main.cpp + * + * Estimate sequencing depth from a (packed) alignment. + */ + + +#include +#include +#include + +#include +#include + +#include "subcommand.hpp" + +#include +#include +#include "../handle.hpp" +#include +#include "../utility.hpp" +#include "../packer.hpp" +#include "algorithms/coverage_depth.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_depth(char** argv) { + cerr << "usage: " << argv[0] << " depth [options] " << endl + << "options:" << endl + << " packed coverage depth:" << endl + << " -k, --pack FILE Supports created from vg pack for given input graph" << endl + << " -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths)" << endl + << " -c, --context-size N Context size (steps) for expanding bin subgraphs [50]" << endl + << " -b, --bin-size N Bin size (in bases) [10000000]" << endl + << " GAM coverage depth:" << endl + << " -g, --gam FILE read alignments from this file (could be '-' for stdin)" << endl + << " -n, --max-nodes N maximum nodes to consider [1000000]" << endl + << " -s, --random-seed N random seed for sampling nodes to consider" << endl + << " common options:" << endl + << " -m, --min-coverage N ignore nodes with less than N coverage [1]" << endl + << " -t, --threads N Number of threads to use [all available]" << endl; +} + +int main_depth(int argc, char** argv) { + + if (argc == 2) { + help_depth(argv); + return 1; + } + + string pack_filename; + vector ref_paths; + size_t context_steps = 50; + size_t bin_size = 10000000; + + string gam_filename; + size_t max_nodes = 1000000; + int random_seed = time(NULL); + + size_t min_coverage = 1; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + + static const struct option long_options[] = { + {"pack", required_argument, 0, 'k'}, + {"ref-path", required_argument, 0, 'p'}, + {"context-size", required_argument, 0, 'c'}, + {"bin-size", required_argument, 0, 'b'}, + {"gam", required_argument, 0, 'g'}, + {"max-nodes", required_argument, 0, 'n'}, + {"random-seed", required_argument, 0, 's'}, + {"min-coverage", required_argument, 0, 'm'}, + {"threads", required_argument, 0, 't'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hk:p:c:b:g:n:s:m:t:", + long_options, &option_index); + + // Detect the end of the options. 
+ if (c == -1) + break; + + switch (c) + { + case 'k': + pack_filename = optarg; + break; + case 'p': + ref_paths.push_back(optarg); + break; + case 'c': + context_steps = parse(optarg); + break; + case 'b': + bin_size = parse(optarg); + break; + case 'g': + gam_filename = optarg; + break; + case 'n': + max_nodes = parse(optarg); + break; + case 's': + random_seed = parse(optarg); + break; + case 'm': + min_coverage = parse(optarg); + break; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg depth] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help_depth(argv); + exit(1); + break; + default: + abort (); + } + } + + if (argc <= 2) { + help_depth(argv); + return 1; + } + + if (pack_filename.empty() == gam_filename.empty() ) { + cerr << "error:[vg depth] Either a pack file (-k) or a gam file (-g) must be given" << endl; + exit(1); + } + + // Read the graph + unique_ptr path_handle_graph; + get_input_file(optind, argc, argv, [&](istream& in) { + path_handle_graph = vg::io::VPKG::load_one(in); + }); + PathHandleGraph* graph = path_handle_graph.get(); + + // Apply the overlay if necessary + bdsg::PathVectorizableOverlayHelper overlay_helper; + if (!pack_filename.empty()) { + graph = dynamic_cast(overlay_helper.apply(path_handle_graph.get())); + assert(graph != nullptr); + } + + // Process the pack + unique_ptr packer; + if (!pack_filename.empty()) { + // Load our packed supports (they must have come from vg pack on graph) + packer = unique_ptr(new Packer(graph)); + packer->load_from_file(pack_filename); + + // All paths if none given + if (ref_paths.empty()) { + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (!Paths::is_alt(path_name)) { + ref_paths.push_back(path_name); + } + }); + } + + for (const string& ref_path : ref_paths) { + map binned_depth = algorithms::binned_packed_depth(*graph, *packer, ref_path, bin_size, get_thread_count()); + for (auto& bin_cov : binned_depth) { + cerr << ref_path << "\t" << bin_cov.first << "\t" << bin_cov.second << endl; + } + } + } + + // Process the gam + if (!gam_filename.empty()) { + double gam_cov; + get_input_file(gam_filename, [&] (istream& gam_stream) { + gam_cov = algorithms::sample_gam_depth(*graph, gam_stream, max_nodes, random_seed, min_coverage); + }); + cerr << "gam-coverage\t" << gam_cov << endl; + } + + return 0; + +} + +// Register subcommand +static Subcommand vg_depth("depth", "estimate sequencing depth", main_depth); + From a50d14f90c0461b58ff0a7abc641ed16dc0d2f12 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 1 Nov 2019 11:52:46 -0400 Subject: [PATCH 13/79] more sensible path coverage. 
include variance --- src/algorithms/coverage_depth.cpp | 249 ++++++++++++++++-------------- src/algorithms/coverage_depth.hpp | 40 ++--- src/subcommand/depth_main.cpp | 65 +++++--- src/utility.cpp | 18 +++ src/utility.hpp | 3 + 5 files changed, 213 insertions(+), 162 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index e848161124f..351ed63e1f0 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -7,134 +7,136 @@ namespace vg { namespace algorithms { -/// Estimate the depth of coverage of a given (sub) graph using the packer -/// Coverage is computed relative to the given path -double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path) { - - // get the path length - path_handle_t path_handle = graph.get_path_handle(ref_path); - size_t path_len = 0; - for (handle_t handle : graph.scan_path(path_handle)) { - path_len += graph.get_length(handle); - } - if (path_len == 0) { - return 0; +void packed_depths(const Packer& packer, const string& path_name, size_t min_coverage, ostream& out_stream) { + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + path_handle_t path_handle = graph.get_path_handle(path_name); + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + Position cur_pos; + size_t path_offset = 1; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + cur_pos.set_node_id(cur_id); + cur_pos.set_is_reverse(graph.get_is_reverse(cur_handle)); + for (size_t i = 0; i < cur_len; ++i) { + cur_pos.set_offset(i); + size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)); + out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n"; + ++path_offset; + } } - - // sum up the coverage - size_t tot_base_coverage = 0; - graph.for_each_handle([&] (handle_t handle) { - Position pos; - pos.set_node_id(graph.get_id(handle)); - size_t packer_pos = packer.position_in_basis(pos); - size_t node_len = graph.get_length(handle); - for (size_t offset = 0; offset < node_len; ++offset) { - tot_base_coverage += packer.coverage_at_position(packer_pos + offset); - } - }); - - // return average (over the path) - return (double)tot_base_coverage / (double)path_len; } - -/// Estimate the binned coverage along a path -map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, - size_t step, size_t context, size_t threads) { - - // move forward along path (note: this can be sped up if we're given a PathPositionHandleGraph but I don't think - // it matters for a couple of scans. 
- function advance = [&] (step_handle_t& step_handle, size_t distance) { - size_t went = 0; - for (; graph.has_next_step(step_handle) && went < distance; step_handle = graph.get_next_step(step_handle)) { - went += graph.get_length(graph.get_handle_of_step(step_handle)); +pair packed_depth_of_bin(const Packer& packer, + step_handle_t start_step, step_handle_t end_plus_one_step, + size_t min_coverage, bool include_deletions) { + + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + + // coverage of each node via deletion (that's contained in the bin) + unordered_map deletion_coverages; + if (include_deletions) { + const VectorizableHandleGraph* vec_graph = dynamic_cast(packer.get_graph()); + unordered_map deletion_candidates; + handle_t prev_handle; + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + graph.follow_edges(cur_handle, true, [&] (handle_t other) { + if (!deletion_candidates.empty() && other!= prev_handle && deletion_candidates.count(other)) { + edge_t edge = graph.edge_handle(other, cur_handle); + size_t edge_pos = vec_graph->edge_index(edge); + size_t deletion_coverage = packer.edge_coverage(edge_pos); + // quadratic alert. if this is too slow, can use interval tree or something + for (step_handle_t del_step = graph.get_next_step(deletion_candidates[other]); + del_step != cur_step; + del_step = graph.get_next_step(del_step)) { + handle_t del_handle = graph.get_handle_of_step(del_step); + nid_t del_id = graph.get_id(del_handle); + if (!deletion_coverages.count(del_id)) { + deletion_coverages[del_id] = deletion_coverage; + } else { + deletion_coverages[del_id] += deletion_coverage; + } + } + } + }); + prev_handle = cur_handle; + deletion_candidates[cur_handle] = cur_step; } - return went; - }; + } - path_handle_t path_handle = graph.get_path_handle(ref_path); - step_handle_t step_handle = graph.path_begin(path_handle); - - // hop along the graph, grabbing a step handle every "step" bases (or thereabouts) - vector> bin_centers; - size_t pos = advance(step_handle, step / 2); - if (pos >= step / 2) { - size_t went; - do { - if (bin_centers.empty() || step_handle != bin_centers.back().second) { - bin_centers.push_back(make_pair(pos, step_handle)); + // compute the mean and variance of our base coverage across the bin + size_t bin_length = 0; + double mean = 0.0; + double M2 = 0.0; + + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + size_t del_coverage = !include_deletions or !deletion_coverages.count(cur_id) ? 
0 : deletion_coverages[cur_id]; + Position cur_pos; + cur_pos.set_node_id(cur_id); + cur_pos.set_is_reverse(graph.get_is_reverse(cur_handle)); + for (size_t i = 0; i < cur_len; ++i) { + cur_pos.set_offset(i); + size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)) + del_coverage; + if (pos_coverage >= min_coverage) { + wellford_update(bin_length, mean, M2, pos_coverage); } - went = advance(step_handle, step); - pos += went; - } while (went >= step); + } } + return wellford_mean_var(bin_length, mean, M2, true); +} - // our graph's too small to do any stepping, just use the first handle - if (bin_centers.empty()) { - bin_centers.push_back(make_pair(0, graph.path_begin(path_handle))); +vector> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size, + size_t min_coverage, bool include_deletions) { + + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + path_handle_t path_handle = graph.get_path_handle(path_name); + + // one scan of our path to collect the bins + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + vector> bins; // start offset / start step of each bin + size_t offset = 0; + size_t cur_bin_size = bin_size; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + if (cur_bin_size >= bin_size) { + bins.push_back(make_pair(offset, cur_step)); + cur_bin_size = 0; + } + size_t node_len = graph.get_length(graph.get_handle_of_step(cur_step)); + offset += node_len; + cur_bin_size += node_len; } - // visit every bin center and make a subgraph to collect coverage from - if (threads == 0) { - threads = get_thread_count(); - } - map binned_depths; -#pragma omp parallel for num_threads(threads) - for (size_t i = 0; i < bin_centers.size(); ++i) { - // extract the subgraph - bdsg::HashGraph subgraph; - step_handle_t bin_step = bin_centers[i].second; - handle_t bin_handle = graph.get_handle_of_step(bin_step); - assert(graph.get_is_reverse(bin_handle) == false); - subgraph.create_handle(graph.get_sequence(bin_handle), graph.get_id(bin_handle)); - expand_subgraph_by_steps(graph, subgraph, context); - - // sum up the coverage on the subgraph - size_t tot_base_coverage = 0; - size_t tot_ref_len = 0; - subgraph.for_each_handle([&] (handle_t sub_handle) { - // go back into the original graph because we don't have any - // path information in the subgraph because we are unable - // to get it without requiring the path position interface - handle_t orig_handle = graph.get_handle(subgraph.get_id(sub_handle)); - Position pos; - pos.set_node_id(graph.get_id(orig_handle)); - size_t packer_pos = packer.position_in_basis(pos); - size_t node_len = graph.get_length(orig_handle); - for (size_t offset = 0; offset < node_len; ++offset) { - tot_base_coverage += packer.coverage_at_position(packer_pos + offset); - } - // we manually test if each handle is on our reference path (again, to - // not require path position interface) - vector step_path_handles = graph.steps_of_handle(orig_handle); - bool on_ref = false; - for (size_t j = 0; j < step_path_handles.size() && !on_ref; ++j) { - on_ref = graph.get_path_handle_of_step(step_path_handles[j]) == path_handle; - } - if (on_ref) { - tot_ref_len += node_len; - } - }); - - assert(tot_ref_len > 0); - double avg_base_coverage = tot_base_coverage / tot_ref_len; - -#pragma omp critical (update_binned_depth) - binned_depths[bin_centers[i].first] = avg_base_coverage; + // parallel 
scan to compute the coverages + vector> binned_depths(bins.size()); +#pragma omp parallel for + for (size_t i = 0; i < bins.size(); ++i) { + step_handle_t bin_start_step = bins[i].second; + step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; + size_t bin_start = bins[i].first; + size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; + pair coverage = packed_depth_of_bin(packer, bin_start_step, bin_end_step, min_coverage, include_deletions); + binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); } return binned_depths; - } + // draw (roughly) max_nodes nodes from the graph using the random seed static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { default_random_engine generator(random_seed); uniform_real_distribution distribution(0, 1); - double cutoff = std::min((double)1.0, (double)(max_nodes / graph.get_node_count())); + double cutoff = std::min((double)1.0, (double)max_nodes / (double)graph.get_node_count()); unordered_map sampled_nodes; graph.for_each_handle([&](handle_t handle) { - if (cutoff == 1 || cutoff < distribution(generator)) { + if (cutoff == 1. || cutoff <= distribution(generator)) { sampled_nodes[graph.get_id(handle)] = 0; } }); @@ -148,52 +150,59 @@ static void update_sample_gam_depth(const Alignment& aln, unordered_map>& node_coverages, size_t min_coverage) { +static pair combine_and_average_node_coverages(const HandleGraph& graph, vector>& node_coverages, size_t min_coverage) { for (int i = 1; i < node_coverages.size(); ++i) { for (const auto& node_cov : node_coverages[i]) { node_coverages[0][node_cov.first] += node_cov.second; } } - size_t tot_coverage = 0; - size_t tot_count = 0; + size_t count = 0; + double mean = 0.; + double M2 = 0.; for (const auto & node_cov : node_coverages[0]) { if (node_cov.second >= min_coverage) { - tot_coverage += node_cov.second; - ++tot_count; + // we normalize the bases covered by the node length as we sum + double node_len = graph.get_length(graph.get_handle(node_cov.first)); + wellford_update(count, mean, M2, (double)node_cov.second / node_len); } } - return tot_count > 0 ? 
(double)tot_coverage / (double)tot_count : 0; + return wellford_mean_var(count, mean, M2, count < graph.get_node_count()); } -double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage) { +pair sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq) { // one node counter per thread vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); function aln_callback = [&](Alignment& aln) { - update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + if (aln.mapping_quality() >= min_mapq) { + update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + } }; vg::io::for_each_parallel(gam_stream, aln_callback); - return combine_and_average_node_coverages(node_coverages, min_coverage); + return combine_and_average_node_coverages(graph, node_coverages, min_coverage); } -double sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage) { +pair sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq) { // one node counter per thread vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); #pragma omp parallel for for (size_t i = 0; i < alignments.size(); ++i) { - update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + if (alignments[i].mapping_quality() >= min_mapq) { + update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + } } - return combine_and_average_node_coverages(node_coverages, min_coverage); + return combine_and_average_node_coverages(graph, node_coverages, min_coverage); } diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index b7ffb645a4a..18d70daa26a 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -17,30 +17,32 @@ namespace algorithms { using namespace std; -/// Estimate the depth of coverage of a given (sub) graph using the packer -/// Coverage is computed relative to the given path -double packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path); - -/// Estimate the binned coverage along a path using the packer -/// ref_path is scanned, and every "step" bases as subgraph is extracted using the given number of context steps -/// If threads is 0, all the threads are used -map binned_packed_depth(const PathHandleGraph& graph, const Packer& packer, const string& ref_path, - size_t step, size_t context, size_t threads = 0); - - -/// Get the depth of a bin -/// the "k_nearest" closest bins to the given position are used -/// bins with coverage below min_coverage are ignored -double get_binned_depth(const unordered_map& binned_depths, size_t pos, size_t k_nearest = 3, double min_coverage = 1.0); - -/// Return the average depth of coverage of randomly sampled nodes from a GAM +/// print path-name offset base-coverage for every base on a path (just like samtools depth) +/// ignoring things below min_coverage. 
offsets are 1-based in output stream
+void packed_depths(const Packer& packer, const string& path_name, size_t min_coverage, ostream& out_stream);
+
+/// Estimate the coverage along a given reference path interval [start_step, end_plus_one_step)
+/// Coverage is obtained only from positions along the path, and variation is not counted
+/// Except if "include_deletions" is true, then reference path positions covered by a deletion edge
+/// (which is contained in the bin) will get the deletion edge's coverage counted.
+/// Other types of events (such as SNPs) can throw off coverage in similar ways but deletions tend to be bigger
+/// (and easier to find), so we hope that counting them is enough.
+pair<double, double> packed_depth_of_bin(const Packer& packer, step_handle_t start_step, step_handle_t end_plus_one_step,
+                                         size_t min_coverage, bool include_deletions);
+
+/// Use all available threads to estimate the binned packed coverage of a path using the above function
+/// Each element is a bin's 0-based open-ended interval in the path, and its coverage mean, variance.
+vector<tuple<size_t, size_t, double, double>> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size,
+                                                                  size_t min_coverage, bool include_deletions);
+
+/// Return the mean and variance of coverage of randomly sampled nodes from a GAM
 /// Nodes with less than min_coverage are ignored
 /// The stream is scanned in parallel with all threads
 /// max_nodes is used to keep memory down
-double sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0);
+pair<double, double> sample_gam_depth(const HandleGraph& graph, istream& gam_stream, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq);

 /// As above, but read a vector instead of a stream
-double sample_gam_depth(const HandleGraph& graph, const vector<Alignment>& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage = 1.0);
+pair<double, double> sample_gam_depth(const HandleGraph& graph, const vector<Alignment>& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq);

 }
 }
diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp
index 7a892397a3e..0e5fd75b1e4 100644
--- a/src/subcommand/depth_main.cpp
+++ b/src/subcommand/depth_main.cpp
@@ -28,15 +28,16 @@ using namespace vg::subcommand;

 void help_depth(char** argv) {
     cerr << "usage: " << argv[0] << " depth [options] <graph>" << endl
         << "options:" << endl
-        << "  packed coverage depth:" << endl
-        << "    -k, --pack FILE       Supports created from vg pack for given input graph" << endl
-        << "    -p, --ref-path NAME   Reference path to call on (multipile allowed. defaults to all paths)" << endl
-        << "    -c, --context-size N  Context size (steps) for expanding bin subgraphs [50]" << endl
-        << "    -b, --bin-size N      Bin size (in bases) [10000000]" << endl
-        << "  GAM coverage depth:" << endl
-        << "    -g, --gam FILE        read alignments from this file (could be '-' for stdin)" << endl
-        << "    -n, --max-nodes N     maximum nodes to consider [1000000]" << endl
-        << "    -s, --random-seed N   random seed for sampling nodes to consider" << endl
+        << "  packed coverage depth (print positional depths along path):" << endl
+        << "    -k, --pack FILE       Supports created from vg pack for given input graph" << endl
+        << "    -p, --ref-path NAME   Reference path to call on (multiple allowed. defaults to all paths)" << endl
+        << "    -b, --bin-size N      Bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
+        << "    -d, --count-dels      Count deletion edges within the bin as covering reference positions" << endl
+        << "  GAM coverage depth (print <mean> <stddev> for depth):" << endl
+        << "    -g, --gam FILE        read alignments from this file (could be '-' for stdin)" << endl
+        << "    -n, --max-nodes N     maximum nodes to consider [1000000]" << endl
+        << "    -s, --random-seed N   random seed for sampling nodes to consider" << endl
+        << "    -Q, --min-mapq N      ignore alignments with mapping quality < N" << endl
         << "  common options:" << endl
         << "    -m, --min-coverage N  ignore nodes with less than N coverage [1]" << endl
         << "    -t, --threads N       Number of threads to use [all available]" << endl;
@@ -51,12 +52,13 @@ int main_depth(int argc, char** argv) {

     string pack_filename;
     vector<string> ref_paths;
-    size_t context_steps = 50;
-    size_t bin_size = 10000000;
+    size_t bin_size = 1;
+    bool count_dels = false;

     string gam_filename;
     size_t max_nodes = 1000000;
     int random_seed = time(NULL);
+    size_t min_mapq = 0;

     size_t min_coverage = 1;

@@ -67,11 +69,12 @@ int main_depth(int argc, char** argv) {
         static const struct option long_options[] = {
             {"pack", required_argument, 0, 'k'},
             {"ref-path", required_argument, 0, 'p'},
-            {"context-size", required_argument, 0, 'c'},
             {"bin-size", required_argument, 0, 'b'},
+            {"count-dels", no_argument, 0, 'd'},
             {"gam", required_argument, 0, 'g'},
             {"max-nodes", required_argument, 0, 'n'},
             {"random-seed", required_argument, 0, 's'},
+            {"min-mapq", required_argument, 0, 'Q'},
             {"min-coverage", required_argument, 0, 'm'},
             {"threads", required_argument, 0, 't'},
             {"help", no_argument, 0, 'h'},
             {0, 0, 0, 0}
         };

         int option_index = 0;
-        c = getopt_long (argc, argv, "hk:p:c:b:g:n:s:m:t:",
+        c = getopt_long (argc, argv, "hk:p:b:dg:n:s:Q:m:t:",
                          long_options, &option_index);

        // Detect the end of the options.
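The mean/variance pairs threaded through the changes below are accumulated with Welford's online algorithm, via the wellford_update/wellford_mean_var helpers this patch adds to src/utility.cpp. A minimal self-contained sketch of the same accumulator, with made-up sample values, shows what the two helpers compute together; Welford's recurrence avoids the catastrophic cancellation a naive sum-of-squares accumulator suffers when values are large and tightly clustered:

    #include <cstdio>
    #include <initializer_list>

    // One-pass update rule, as in wellford_update: no samples are stored.
    static void update(size_t& n, double& mean, double& M2, double x) {
        ++n;
        double delta = x - mean;   // deviation from the old mean
        mean += delta / (double)n;
        M2 += delta * (x - mean);  // times deviation from the new mean
    }

    int main() {
        size_t n = 0;
        double mean = 0.0, M2 = 0.0;
        for (double x : {2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0}) {
            update(n, mean, M2, x);
        }
        // population variance, i.e. wellford_mean_var(n, mean, M2, false)
        std::printf("mean=%g variance=%g\n", mean, M2 / (double)n); // mean=5 variance=4
        return 0;
    }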
@@ -94,12 +97,12 @@ int main_depth(int argc, char** argv) {
         case 'p':
             ref_paths.push_back(optarg);
             break;
-        case 'c':
-            context_steps = parse<size_t>(optarg);
-            break;
         case 'b':
             bin_size = parse<size_t>(optarg);
             break;
+        case 'd':
+            count_dels = true;
+            break;
         case 'g':
             gam_filename = optarg;
             break;
@@ -109,6 +112,9 @@ int main_depth(int argc, char** argv) {
         case 's':
             random_seed = parse<int>(optarg);
             break;
+        case 'Q':
+            min_mapq = parse<size_t>(optarg);
+            break;
         case 'm':
             min_coverage = parse<size_t>(optarg);
             break;
@@ -172,25 +178,39 @@ int main_depth(int argc, char** argv) {
                     ref_paths.push_back(path_name);
                 }
             });
+        } else {
+            for (const string& ref_name : ref_paths) {
+                if (!graph->has_path(ref_name)) {
+                    cerr << "error:[vg depth] Path \"" << ref_name << "\" not found in graph" << endl;
+                    exit(1);
+                }
+            }
         }
+
         for (const string& ref_path : ref_paths) {
-            map<size_t, double> binned_depth = algorithms::binned_packed_depth(*graph, *packer, ref_path, bin_size, get_thread_count());
-            for (auto& bin_cov : binned_depth) {
-                cerr << ref_path << "\t" << bin_cov.first << "\t" << bin_cov.second << endl;
+            if (bin_size > 1) {
+                vector<tuple<size_t, size_t, double, double>> binned_depth =
+                    algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels);
+                for (auto& bin_cov : binned_depth) {
+                    cout << ref_path << "\t" << (get<0>(bin_cov) + 1) << "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
+                         << "\t" << sqrt(get<3>(bin_cov)) << endl;
+                }
+            } else {
+                algorithms::packed_depths(*packer, ref_path, min_coverage, cout);
             }
         }
     }

     // Process the gam
     if (!gam_filename.empty()) {
-        double gam_cov;
+        pair<double, double> gam_cov;
         get_input_file(gam_filename, [&] (istream& gam_stream) {
-            gam_cov = algorithms::sample_gam_depth(*graph, gam_stream, max_nodes, random_seed, min_coverage);
+            gam_cov = algorithms::sample_gam_depth(*graph, gam_stream, max_nodes, random_seed, min_coverage, min_mapq);
         });
-        cerr << "gam-coverage\t" << gam_cov << endl;
+        cout << gam_cov.first << "\t" << sqrt(gam_cov.second) << endl;
     }
-    
+
     return 0;
 }
diff --git a/src/utility.cpp b/src/utility.cpp
index a9f72f12cf5..ee6c71ef205 100644
--- a/src/utility.cpp
+++ b/src/utility.cpp
@@ -336,6 +336,24 @@ double median(std::vector<double> &v) {
         return 0.5*(vn+v[n-1]);
     }
 }
+
+// from the Python example here:
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+void wellford_update(size_t& count, double& mean, double& M2, double new_val) {
+    ++count;
+    double delta = new_val - mean;
+    mean += delta / (double)count;
+    double delta2 = new_val - mean;
+    M2 += delta * delta2;
+}
+
+pair<double, double> wellford_mean_var(size_t count, double mean, double M2, bool sample_variance) {
+    if (count == 0 || (sample_variance && count == 1)) {
+        return make_pair(nan(""), nan(""));
+    } else {
+        return make_pair(mean, M2 / (double)(sample_variance ?
count - 1 : count)); + } +} vector range_vector(size_t begin, size_t end) { size_t len = end - begin; diff --git a/src/utility.hpp b/src/utility.hpp index 72c04c3144a..d3823b6c05b 100644 --- a/src/utility.hpp +++ b/src/utility.hpp @@ -51,6 +51,9 @@ string nonATGCNtoN(const string& s); string toUppercase(const string& s); double median(std::vector &v); double stdev(const std::vector& v); +// Online mean-variance computation with wellfords algorithm (pass 0's to 1st 3 params to start) +void wellford_update(size_t& count, double& mean, double& M2, double new_val); +pair wellford_mean_var(size_t count, double mean, double M2, bool sample_variance = false); // write a fasta sqeuence void write_fasta_sequence(const std::string& name, const std::string& sequence, ostream& os, size_t width=80); From 4b664f53c35e5f3baa10aa1e7c28007c2b86ccf2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 1 Nov 2019 16:57:21 -0700 Subject: [PATCH 14/79] Turn off debug output but leave improved provenance --- src/minimizer_mapper.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index 7cfe7bd0ce6..1b6079be4a5 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -15,7 +15,6 @@ #include #include - namespace vg { using namespace std; @@ -31,6 +30,10 @@ MinimizerMapper::MinimizerMapper(const gbwtgraph::GBWTGraph& graph, const gbwtgr void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // For each input alignment + +#ifdef debug + cerr << "Read " << aln.name() << ": " << aln.sequence() << endl; +#endif // Make a new funnel instrumenter to watch us map this read. Funnel funnel; @@ -109,7 +112,17 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // of the selected minimizers is not high enough. size_t hits = minimizer_index.count(minimizers[minimizer_num]); - if (hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { +#ifdef debug + cerr << "Minimizer " << minimizer_num << " = " << minimizers[minimizer_num].key.decode(minimizer_index.k()) + << " has " << hits << " hits" << endl; +#endif + + if (hits == 0) { + // A minimizer with no hits can't go on. + if (track_provenance) { + funnel.fail("any-hits", minimizer_num); + } + } else if (hits <= hit_cap || (hits <= hard_hit_cap && selected_score + minimizer_score[minimizer_num] <= target_score)) { // Locate the hits. for (auto& hit : minimizer_index.find(minimizers[minimizer_num])) { // Reverse the hits for a reverse minimizer @@ -125,6 +138,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { if (track_provenance) { // Record in the funnel that this minimizer gave rise to these seeds. 
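+                    // (the minimizer had at least one hit, so it passes the new "any-hits" stage)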
+ funnel.pass("any-hits", minimizer_num); funnel.pass("hard-hit-cap", minimizer_num); funnel.pass("hit-cap||score-fraction", minimizer_num, selected_score / base_target_score); funnel.expand(minimizer_num, hits); @@ -133,6 +147,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // Passed hard hit cap but failed score fraction/normal hit cap rejected_count++; if (track_provenance) { + funnel.pass("any-hits", minimizer_num); funnel.pass("hard-hit-cap", minimizer_num); funnel.fail("hit-cap||score-fraction", minimizer_num, (selected_score + minimizer_score[minimizer_num]) / base_target_score); } @@ -140,6 +155,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { // Failed hard hit cap rejected_count++; if (track_provenance) { + funnel.pass("any-hits", minimizer_num); funnel.fail("hard-hit-cap", minimizer_num); } } @@ -177,7 +193,6 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { } #ifdef debug - cerr << "Read " << aln.name() << ": " << aln.sequence() << endl; cerr << "Found " << seeds.size() << " seeds from " << (minimizers.size() - rejected_count) << " minimizers, rejected " << rejected_count << endl; #endif @@ -304,7 +319,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { vector& cluster = clusters[cluster_num]; #ifdef debug - cerr << "Cluster " << cluster_num << " rank " << i << ": " << endl; + cerr << "Cluster " << cluster_num << endl; #endif // Pack the seeds for GaplessExtender. From 5e984558fac66ef32553ffd7ad098428d956290b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 11:20:29 -0800 Subject: [PATCH 15/79] Make Giraffe Wrangler do vg stats to compare map and giraffe --- scripts/giraffe-wrangler.sh | 188 +++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 59 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index 9bce06a9bad..aeb2ce6d275 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -9,20 +9,39 @@ usage() { exec 1>&2 printf "Usage: $0 [Options] FASTA XG_INDEX GCSA_INDEX GBWT_INDEX MINIMIZER_INDEX DISTANCE_INDEX SIM_GAM REAL_FASTQ\n" printf "\n" + printf "Inputs may be files or S3 URLs.\n" + printf "\n" + printf "Arguments:\n" + printf " FASTA FASTA reference to run bwa-mem against; may be \"\"\n" + printf " XG_INDEX XG to annotate reads with positions\n" + printf " GCSA_INDEX GCSA (with LCP) for running vg map\n" + printf " GBWT_INDEX Haplotypes for mapping with Giraffe\n" + printf " MINIMIZER_INDEX Minimizers for mapping with Giraffe\n" + printf " DISTANCE_INDEX Distances for mapping with Giraffe\n" + printf " SIM_GAM Simulated reads for measuring mapping accuracy; may be \"\"\n" + printf " REAL_FASTQ Real reads for measuring mapping performance; may be \"\"\n" + printf "\n" printf "Options:\n" - printf " -t N Use N threads\n" + printf " -s DEST Save alignments and other internal files to DEST (directory or S3 url)\n" + printf " -t N Use N threads\n" printf "\n" exit 1 } +# Define where we should save our output +OUTPUT_DEST="" + # Define the thread count for everyone. Can be changed with -t. # Should fit on a NUMA node THREAD_COUNT=24 -while getopts ":t:" o; do +while getopts ":s:t:" o; do case "${o}" in + s) + OUTPUT_DEST="${OPTARG}" + ;; t) - THREAD_COUNT=$OPTARG + THREAD_COUNT="${OPTARG}" ;; ?) usage @@ -38,10 +57,10 @@ fi echo "Using ${THREAD_COUNT} threads" fetch_input() { - # Download the specified file, if not a file already. 
+ # Download the specified file, if not empty and not a file already. # Dumps all files into the current directory as their basenames # Output the new filename - if [[ "${1}" == s3://* ]] ; then + if [[ ! -z "${1}" && "${1}" == s3://* ]] ; then aws s3 --quiet cp "${1}" "$(basename "${1}")" basename "${1}" else @@ -50,10 +69,13 @@ fetch_input() { } FASTA="$(fetch_input "${1}")" -for EXT in amb ann bwt fai pac sa ; do - # Make sure we have all the indexes adjacent to the FASTA - fetch_input "${1}.${EXT}" >/dev/null -done +if [[ ! -z ${FASTA} ]] ; then + # We have a FASTA + for EXT in amb ann bwt fai pac sa ; do + # Make sure we have all the indexes adjacent to the FASTA + fetch_input "${1}.${EXT}" >/dev/null + done +fi shift XG_INDEX="$(fetch_input "${1}")" # Make sure we have the GBWTGraph pre-made @@ -112,60 +134,84 @@ if [[ "${NUMA_COUNT}" -gt "1" ]] ; then fi fi -if which perf >/dev/null 2>&1 ; then - # Record profile. - # Do this first because perf is likely to be misconfigured and we want to fail fast. - - # If we don't strip bin/vg to make it small, the addr2line calls that perf - # script makes take forever because the binary is huge - strip -d bin/vg - - ${NUMA_PREFIX} perf record -F 100 --call-graph dwarf -o "${WORK}/perf.data" vg gaffe "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/perf.gam" - perf script -i "${WORK}/perf.data" >"${WORK}/out.perf" - deps/FlameGraph/stackcollapse-perf.pl "${WORK}/out.perf" >"${WORK}/out.folded" - deps/FlameGraph/flamegraph.pl "${WORK}/out.folded" > "${WORK}/profile.svg" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + if which perf >/dev/null 2>&1 ; then + # Record profile. + # Do this first because perf is likely to be misconfigured and we want to fail fast. + + # If we don't strip bin/vg to make it small, the addr2line calls that perf + # script makes take forever because the binary is huge + strip -d bin/vg + + ${NUMA_PREFIX} perf record -F 100 --call-graph dwarf -o "${WORK}/perf.data" vg gaffe "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/perf.gam" + perf script -i "${WORK}/perf.data" >"${WORK}/out.perf" + deps/FlameGraph/stackcollapse-perf.pl "${WORK}/out.perf" >"${WORK}/out.folded" + deps/FlameGraph/flamegraph.pl "${WORK}/out.folded" > "${WORK}/profile.svg" + fi fi -# Run simulated reads, with stats -${NUMA_PREFIX} vg gaffe --track-correctness -x "${XG_INDEX}" "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/mapped.gam" +if [[ ! -z "${SIM_GAM}" ]] ; then + # Do simulated reads + + # Run simulated reads, with stats + ${NUMA_PREFIX} vg gaffe --track-correctness -x "${XG_INDEX}" "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/mapped.gam" + + # And map to compare with them + ${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" >"${WORK}/mapped-map.gam" + + # Annotate and compare against truth + vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped.gam" >"${WORK}/annotated.gam" + vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped-map.gam" >"${WORK}/annotated-map.gam" + + # GAM compare against truth. Use gamcompare to count correct reads to save a JSON scan. 
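# (Editorial sketch: the "2>&1 >/dev/null" used in the two commands below is
# order-sensitive shell -- stderr is duplicated onto the current stdout (the
# pipe) before stdout itself is discarded, so sed receives only gamcompare's
# stderr summary. The same idiom with a hypothetical command:
#
#   count="$(some_tool 2>&1 >/dev/null | sed 's/[^0-9]//g')"
#
# Writing ">/dev/null 2>&1" instead would silence both streams and leave the
# variable empty.)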
+ CORRECT_COUNT="$(vg gamcompare -r 100 "${WORK}/annotated.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + CORRECT_COUNT_MAP="$(vg gamcompare -r 100 "${WORK}/annotated-map.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + + # Compute identity of mapped reads + MEAN_IDENTITY="$(vg view -aj "${WORK}/mapped.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" + MEAN_IDENTITY_MAP="$(vg view -aj "${WORK}/mapped-map.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" -# And map to compare with them -${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" >"${WORK}/mapped-map.gam" + # Compute loss stages + # Let giraffe facts errors out + vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt" +fi -# Annotate and compare against truth -vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped.gam" >"${WORK}/annotated.gam" -vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped-map.gam" >"${WORK}/annotated-map.gam" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Now do the real reads -# GAM compare against truth. Use gamcompare to count correct reads to save a JSON scan. -CORRECT_COUNT="$(vg gamcompare -r 100 "${WORK}/annotated.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" -CORRECT_COUNT_MAP="$(vg gamcompare -r 100 "${WORK}/annotated-map.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + # Count them + REAL_READ_COUNT="$(cat "${REAL_FASTQ}" | wc -l)" + ((REAL_READ_COUNT /= 4)) -# Compute identity of mapped reads -MEAN_IDENTITY="$(vg view -aj "${WORK}/mapped.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" -MEAN_IDENTITY_MAP="$(vg view -aj "${WORK}/mapped-map.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" + # Get RPS for Giraffe + ${NUMA_PREFIX} vg gaffe -p "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/real.gam" 2>"${WORK}/log.txt" -# Compute loss stages -# Let giraffe facts errors out -vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt" + GIRAFFE_RPS="$(cat "${WORK}/log.txt" | grep "reads per second" | sed 's/[^0-9.]//g')" -# Now do the real reads + if [[ ! -z "${FASTA}" ]] ; then + # Get RPS for bwa-mem -# Get RPS -${NUMA_PREFIX} vg gaffe -p "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/real.gam" 2>"${WORK}/log.txt" + ${NUMA_PREFIX} bwa mem -t "${THREAD_COUNT}" "${FASTA}" "${REAL_FASTQ}" >"${WORK}/mapped.bam" 2>"${WORK}/bwa-log.txt" -GIRAFFE_RPS="$(cat "${WORK}/log.txt" | grep "reads per second" | sed 's/[^0-9.]//g')" + # Now we get all the batch times from BWA and use those to compute RPS values. + # This is optimistic but hopefully consistent. 
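# (Editorial sketch: the pipeline below expects bwa-mem batch lines of the
# shape its sed expression matches; with made-up values such as
#
#   [M::mem_process_seqs] Processed 100000 reads in 120.0 CPU sec, 25.0 real sec
#   [M::mem_process_seqs] Processed 100000 reads in 118.0 CPU sec, 25.0 real sec
#
# the awk sums give 200000 reads / 50.0 real sec = 4000 reads/second across
# all threads, which is then divided by THREAD_COUNT to get BWA_RPS.)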
+ BWA_RPS_ALL_THREADS="$(cat "${WORK}/bwa-log.txt" | grep "Processed" | sed 's/[^0-9]*\([0-9]*\) reads in .* CPU sec, \([0-9]*\.[0-9]*\) real sec/\1 \2/g' | tr ' ' '\t' | awk '{sum1+=$1; sum2+=$2} END {print sum1/sum2}')" -# Get RPS for bwa-mem -REAL_READ_COUNT="$(cat "${REAL_FASTQ}" | wc -l)" -((REAL_READ_COUNT /= 4)) + BWA_RPS="$(echo "${BWA_RPS_ALL_THREADS} / ${THREAD_COUNT}" | bc -l)" + + fi -${NUMA_PREFIX} bwa mem -t "${THREAD_COUNT}" "${FASTA}" "${REAL_FASTQ}" >"${WORK}/mapped.bam" 2>"${WORK}/bwa-log.txt" + # Align the real reads with map, ignoring speed + ${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" >"${WORK}/real-map.gam" -# Now we get all the batch times from BWA and use those to compute RPS values. -# This is optimistic but hopefully consistent. -BWA_RPS_ALL_THREADS="$(cat "${WORK}/bwa-log.txt" | grep "Processed" | sed 's/[^0-9]*\([0-9]*\) reads in .* CPU sec, \([0-9]*\.[0-9]*\) real sec/\1 \2/g' | tr ' ' '\t' | awk '{sum1+=$1; sum2+=$2} END {print sum1/sum2}')" + # Compute stats for giraffe and map on real reads + echo "Real read stats:" >"${WORK}/real-stats.txt" + echo "Giraffe:" >>"${WORK}/real-stats.txt" + vg stats -a "${WORK}/real.gam" >>"${WORK}/real-stats.txt" 2>&1 + echo "Map:" >>"${WORK}/real-stats.txt" + vg stats -a "${WORK}/real-map.gam" >>"${WORK}/real-stats.txt" 2>&1 +fi -BWA_RPS="$(echo "${BWA_RPS_ALL_THREADS} / ${THREAD_COUNT}" | bc -l)" echo "==== Giraffe Wrangler Report for vg $(vg version -s) ====" @@ -173,20 +219,44 @@ if [[ "${NUMA_WARNING}" == "1" ]] ; then echo "WARNING! Unable to restrict to a single NUMA node! Results may have high variance!" fi -if which perf >/dev/null 2>&1 ; then - # Output perf stuff - mv "${WORK}/perf.data" ./perf.data - mv "${WORK}/profile.svg" ./profile.svg - echo "Profiling information saved as ./perf.data" - echo "Interactive flame graph (for browsers) saved as ./profile.svg" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + if which perf >/dev/null 2>&1 ; then + # Output perf stuff + mv "${WORK}/perf.data" ./perf.data + mv "${WORK}/profile.svg" ./profile.svg + echo "Profiling information saved as ./perf.data" + echo "Interactive flame graph (for browsers) saved as ./profile.svg" + fi fi # Print the report -echo "Giraffe got ${CORRECT_COUNT} simulated reads correct with ${MEAN_IDENTITY} average identity per mapped base" -echo "Map got ${CORRECT_COUNT_MAP} simulated reads correct with ${MEAN_IDENTITY_MAP} average identity per mapped base" -echo "Giraffe aligned real reads at ${GIRAFFE_RPS} reads/second vs. bwa-mem's ${BWA_RPS} reads/second on ${THREAD_COUNT} threads" +if [[ ! -z "${SIM_GAM}" ]] ; then + # Include simulated reads + echo "Giraffe got ${CORRECT_COUNT} simulated reads correct with ${MEAN_IDENTITY} average identity per mapped base" + echo "Map got ${CORRECT_COUNT_MAP} simulated reads correct with ${MEAN_IDENTITY_MAP} average identity per mapped base" +fi +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Include real reads + echo "Giraffe aligned real reads at ${GIRAFFE_RPS} reads/second on ${THREAD_COUNT} threads" + if [[ ! -z "${FASTA}" ]] ; then + echo "bwa-mem aligned real reads at ${BWA_RPS} reads/second on ${THREAD_COUNT} threads" + fi +fi + +if [[ ! -z "${SIM_GAM}" ]] ; then + # Print Giraffe Facts for simulated reads + cat "${WORK}/facts.txt" +fi -cat "${WORK}/facts.txt" +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Print real read stats + cat "${WORK}/real-stats.txt" +fi + +if [[ ! 
-z "${OUTPUT_DEST}" ]] ; then + # Save our intermediates + aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" +fi rm -Rf "${WORK}" From 9616c75c83e010576563cbdf67c5ad972692c83b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 11:45:20 -0800 Subject: [PATCH 16/79] Handle case where libsdsl.a exists but shouldn't `$(LIB_DIR)/libdivsufsort.a` and `$(LIB_DIR)/libdivsufsort64.a` are made by the SDSL build process but might not exist sometimes in the Travis cache. See https://stackoverflow.com/q/3046117 --- Makefile | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 55e69cb8494..f087d42fd12 100644 --- a/Makefile +++ b/Makefile @@ -381,8 +381,16 @@ ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) else +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) -endif +endif + +# Make sure the divsufsort libraries also come from SDSL +$(LIB_DIR)/libdivsufsort.a: $(LIB_DIR)/libsdsl.a + @ +$(LIB_DIR)/libdivsufsort64.a: $(LIB_DIR)/libsdsl.a + @ + +.SECONDARY: $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.h +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(LIB_DIR) @@ -397,7 +405,7 @@ $(LIB_DIR)/librocksdb.a: $(LIB_DIR)/libsnappy.a $(ROCKSDB_DIR)/db/*.cc $(ROCKSDB $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a -$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) +$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) libgcsa2.a $(FILTER) && mv libgcsa2.a $(CWD)/$(LIB_DIR) else @@ -406,7 +414,7 @@ endif $(INC_DIR)/gbwt/dynamic_gbwt.h: $(LIB_DIR)/libgbwt.a -$(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GBWT_DIR)/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) +$(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GBWT_DIR)/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv libgbwt.a $(CWD)/$(LIB_DIR) else @@ -415,7 +423,7 @@ endif $(INC_DIR)/gbwtgraph/gbwtgraph.h: $(LIB_DIR)/libgbwtgraph.a -$(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) +$(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) ifeq ($(shell uname -s),Darwin) +. 
./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv libgbwtgraph.a $(CWD)/$(LIB_DIR) else @@ -614,7 +622,7 @@ $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a . ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ -$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/src/*.cpp $(LIBBDSG_DIR)/include/bdsg/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp +$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/src/*.cpp $(LIBBDSG_DIR)/include/bdsg/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp +. ./source_me.sh && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && pwd && cp -r include/bdsg $(CWD)/$(INC_DIR) $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp @@ -626,7 +634,7 @@ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* # The xg repo has a cmake build system based all around external projects, and # we need it to use our installed versions of everything instead. -$(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(INC_DIR)/gfakluge.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a +$(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(INC_DIR)/gfakluge.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a +rm -f $@ +cp -r $(XG_DIR)/src/*.hpp $(CWD)/$(INC_DIR) +. 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER)

From 4c8aa98c569cba26c9708ef5138a38051e2adf8c Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 4 Nov 2019 15:43:43 -0500
Subject: [PATCH 17/79] small test for vg depth

---
 src/algorithms/coverage_depth.cpp |  4 +++-
 src/algorithms/coverage_depth.hpp |  2 ++
 src/subcommand/depth_main.cpp     | 21 ++++++++++++---------
 test/t/49_vg_depth.t              | 21 +++++++++++++++++++++
 4 files changed, 38 insertions(+), 10 deletions(-)
 create mode 100644 test/t/49_vg_depth.t

diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp
index 351ed63e1f0..b68b8c50b55 100644
--- a/src/algorithms/coverage_depth.cpp
+++ b/src/algorithms/coverage_depth.cpp
@@ -23,7 +23,9 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov
         for (size_t i = 0; i < cur_len; ++i) {
             cur_pos.set_offset(i);
             size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos));
-            out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n";
+            if (pos_coverage >= min_coverage) {
+                out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n";
+            }
             ++path_offset;
         }
     }
diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp
index 18d70daa26a..730cd316f5a 100644
--- a/src/algorithms/coverage_depth.hpp
+++ b/src/algorithms/coverage_depth.hpp
@@ -27,6 +27,8 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov
 /// (which is contained in the bin) will get the deletion edge's coverage counted.
 /// Other types of events (such as SNPs) can throw off coverage in similar ways but deletions tend to be bigger
 /// (and easier to find), so we hope that counting them is enough.
+/// If one wants to infer deletions from the coverage, obviously this should be false, but if looking for
+/// a background coverage for genotyping, then setting it to true may be helpful
 pair packed_depth_of_bin(const Packer& packer, step_handle_t start_step, step_handle_t end_plus_one_step, size_t min_coverage, bool include_deletions);

diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp
index 0e5fd75b1e4..d60f53177b9 100644
--- a/src/subcommand/depth_main.cpp
+++ b/src/subcommand/depth_main.cpp
@@ -29,18 +29,18 @@ void help_depth(char** argv) {
     cerr << "usage: " << argv[0] << " depth [options] " << endl
          << "options:" << endl
          << " packed coverage depth (print positional depths along path):" << endl
-         << "    -k, --pack FILE        Supports created from vg pack for given input graph" << endl
-         << "    -p, --ref-path NAME    Reference path to call on (multipile allowed. defaults to all paths)" << endl
-         << "    -b, --bin-size N       Bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
-         << "    -d, --count-dels       Count deletion edges within the bin as covering reference positions" << endl
+         << "    -k, --pack FILE        supports created from vg pack for given input graph" << endl
+         << "    -p, --ref-path NAME    reference path to call on (multiple allowed. defaults to all paths)" << endl
+         << "    -b, --bin-size N       bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
+         << "    -d, --count-dels       count deletion edges within the bin as covering reference positions" << endl
          << " GAM coverage depth (print for depth):" << endl
          << "    -g, --gam FILE         read alignments from this file (could be '-' for stdin)" << endl
          << "    -n, --max-nodes N      maximum nodes to consider [1000000]" << endl
          << "    -s, --random-seed N    random seed for sampling nodes to consider" << endl
-         << "    -Q, --min-mapq N       ignore alignments with mapping quality < N" << endl
+         << "    -Q, --min-mapq N       ignore alignments with mapping quality < N [0]" << endl
          << " common options:" << endl
-         << "    -m, --min-coverage N   ignore nodes with less than N coverage [1]" << endl
-         << "    -t, --threads N        Number of threads to use [all available]" << endl;
+         << "    -m, --min-coverage N   ignore nodes with less than N coverage [1]" << endl
+         << "    -t, --threads N        number of threads to use [all available]" << endl;
 }

 int main_depth(int argc, char** argv) {
@@ -192,8 +192,11 @@ int main_depth(int argc, char** argv) {
             vector> binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels);
             for (auto& bin_cov : binned_depth) {
-                cout << ref_path << "\t" << (get<0>(bin_cov) + 1)<< "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
-                     << "\t" << sqrt(get<3>(bin_cov)) << endl;
+                // bins can be nan if min_coverage filters everything out. just skip
+                if (!isnan(get<3>(bin_cov))) {
+                    cout << ref_path << "\t" << (get<0>(bin_cov) + 1)<< "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
+                         << "\t" << sqrt(get<3>(bin_cov)) << endl;
+                }
             }
         } else {
             algorithms::packed_depths(*packer, ref_path, min_coverage, cout);
diff --git a/test/t/49_vg_depth.t b/test/t/49_vg_depth.t
new file mode 100644
index 00000000000..fb2d46d29fc
--- /dev/null
+++ b/test/t/49_vg_depth.t
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+BASH_TAP_ROOT=../deps/bash-tap
+. ../deps/bash-tap/bash-tap-bootstrap
+
+PATH=../bin:$PATH # for vg
+
+plan tests 3
+
+vg construct -m 10 -r tiny/tiny.fa >flat.vg
+vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg
+vg sim -l 30 -x 2snp.vg -n 30 -a >2snp.sim
+vg index -x flat.xg -g flat.gcsa -k 16 flat.vg
+vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam
+vg pack -x flat.xg -o 2snp.gam.cx -g 2snp.gam
+# total read bases (30 * 30) / total graph bases 50 = 18
+is $(vg depth flat.vg -g 2snp.gam | awk '{print $1}') 18 "vg depth gets correct depth from gam"
+is $(vg depth flat.xg -k 2snp.gam.cx -b 100000 | awk '{print $4}') 18 "vg depth gets correct depth from pack"
+is $(vg depth flat.xg -k 2snp.gam.cx -b 10 | wc -l) 5 "vg depth gets correct number of bins"
+
+rm -f flat.vg flat.gcsa flat.xg 2snp.vg 2snp.sim 2snp.gam 2snp.gam.cx

From 0b819df32af63490dd9a2e77286d5d49f47bd150 Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Mon, 4 Nov 2019 13:19:06 -0800
Subject: [PATCH 18/79] Check if SDSL build actually happened, and re-copy

---
 Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index f087d42fd12..ec110cfcd6c 100644
--- a/Makefile
+++ b/Makefile
@@ -376,7 +376,7 @@ test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/json2pb.h $
 $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c
 	+.
./source_me.sh && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) $(FILTER) && cp -r lib/* $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ -$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp +$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp $(SDSL_DIR)/build/lib/libsdsl.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) else @@ -384,11 +384,12 @@ else endif # Make sure the divsufsort libraries also come from SDSL +# They might get deleted after libsdsl is installed $(LIB_DIR)/libdivsufsort.a: $(LIB_DIR)/libsdsl.a - @ + cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a: $(LIB_DIR)/libsdsl.a - @ + cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a $(LIB_DIR)/libdivsufsort64.a .SECONDARY: $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a From f280cb4cfffe9250457be53e45ab7817f5077933 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 13:19:54 -0800 Subject: [PATCH 19/79] Catch if distance index is missing snarls at use time --- scripts/giraffe-wrangler.sh | 2 +- src/min_distance.hpp | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index aeb2ce6d275..4e30c85df28 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -13,7 +13,7 @@ usage() { printf "\n" printf "Arguments:\n" printf " FASTA FASTA reference to run bwa-mem against; may be \"\"\n" - printf " XG_INDEX XG to annotate reads with positions\n" + printf " XG_INDEX XG to annotate reads with positions, with corresponding .gg GBWTGraph\n" printf " GCSA_INDEX GCSA (with LCP) for running vg map\n" printf " GBWT_INDEX Haplotypes for mapping with Giraffe\n" printf " MINIMIZER_INDEX Minimizers for mapping with Giraffe\n" diff --git a/src/min_distance.hpp b/src/min_distance.hpp index 7bcdc95bdf6..20ea9f6d715 100644 --- a/src/min_distance.hpp +++ b/src/min_distance.hpp @@ -385,8 +385,15 @@ class MinimumDistanceIndex { pair common_ancestor, pos_t& pos, bool rev) const; - //Get the index into chain_indexes/rank in chain of node i + /// Get the index into chain_indexes/rank in chain of node i. + /// Detects and throws an error if node i never got assigned to a snarl. size_t getPrimaryAssignment(id_t i) const { + auto stored = primary_snarl_assignments[i - min_node_id]; + if (stored == 0) { + // Somebody asked for a node. It should be assigned to a snarl, but it isn't. + throw runtime_error("Node " + std::to_string(i) + " not in any snarl. 
Distance index does " + + "not match graph or was not generated from a snarl set including trivial snarls."); + } return primary_snarl_assignments[i - min_node_id] - 1; } From ce6b5e6bc3a7d928d0e743786bb6427cf4f899ed Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 4 Nov 2019 16:48:53 -0500 Subject: [PATCH 20/79] make sim test more robust --- test/t/49_vg_depth.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/49_vg_depth.t b/test/t/49_vg_depth.t index fb2d46d29fc..c77a34a82d4 100644 --- a/test/t/49_vg_depth.t +++ b/test/t/49_vg_depth.t @@ -9,13 +9,13 @@ plan tests 3 vg construct -m 10 -r tiny/tiny.fa >flat.vg vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg -vg sim -l 30 -x 2snp.vg -n 30 -a >2snp.sim +vg sim -l 30 -x 2snp.vg -n 30 -a -s 1 >2snp.sim vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam vg pack -x flat.xg -o 2snp.gam.cx -g 2snp.gam # total read bases (30 * 30) / total graph bases 50 = 18 is $(vg depth flat.vg -g 2snp.gam | awk '{print $1}') 18 "vg depth gets correct depth from gam" -is $(vg depth flat.xg -k 2snp.gam.cx -b 100000 | awk '{print $4}') 18 "vg depth gets correct depth from pack" +is $(vg depth flat.xg -k 2snp.gam.cx -b 100000 | awk '{print int($4)}') 18 "vg depth gets correct depth from pack" is $(vg depth flat.xg -k 2snp.gam.cx -b 10 | wc -l) 5 "vg depth gets correct number of bins" rm -f flat.vg flat.gcsa flat.xg 2snp.vg 2snp.sim 2snp.gam 2snp.gam.cx From 6047991af595ebdc25a8b9396a18ddad640aeffb Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 14:20:01 -0800 Subject: [PATCH 21/79] Use pattern rule to express multiple SDSL libs --- Makefile | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index ec110cfcd6c..a99320eb884 100644 --- a/Makefile +++ b/Makefile @@ -376,23 +376,16 @@ test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/json2pb.h $ $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c +. ./source_me.sh && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) $(FILTER) && cp -r lib/* $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ -$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp $(SDSL_DIR)/build/lib/libsdsl.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a +# Use fake patterns to tell Make that this rule generates all these files when run once. +# Here % should always match "lib" which is a common substring. +# See https://stackoverflow.com/a/19822767 +$(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) else +. 
./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) endif -# Make sure the divsufsort libraries also come from SDSL -# They might get deleted after libsdsl is installed -$(LIB_DIR)/libdivsufsort.a: $(LIB_DIR)/libsdsl.a - cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort.a $(LIB_DIR)/libdivsufsort.a - -$(LIB_DIR)/libdivsufsort64.a: $(LIB_DIR)/libsdsl.a - cp $(SDSL_DIR)/build/external/libdivsufsort/lib/libdivsufsort64.a $(LIB_DIR)/libdivsufsort64.a - -.SECONDARY: $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a - $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.h +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(LIB_DIR) From 07a23d9ce3e6249fb53b384a1b8a8c9df2135214 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2019 14:21:44 -0800 Subject: [PATCH 22/79] Don't try and use aws s3 to move files locally --- scripts/giraffe-wrangler.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index 4e30c85df28..908d21e9f08 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -254,8 +254,13 @@ if [[ ! -z "${REAL_FASTQ}" ]] ; then fi if [[ ! -z "${OUTPUT_DEST}" ]] ; then - # Save our intermediates - aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" + if [[ "${OUTPUT_DEST}" == s3://* ]] ; then + # Save our intermediates to S3 + aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" + else + # Save our intermediates to disk + cp -R "${WORK}" "${OUTPUT_DEST}" + fi fi rm -Rf "${WORK}" From a7a1220d580651b6fe502a1a3921915c049780a3 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Mon, 4 Nov 2019 15:48:38 -0800 Subject: [PATCH 23/79] added bidirectional gbwt option --- src/subcommand/rna_main.cpp | 13 ++++++++++--- src/transcriptome.cpp | 4 ++-- src/transcriptome.hpp | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index c40151dd5a5..1d180231ded 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -33,6 +33,7 @@ void help_rna(char** argv) { << " -a, --add-non-ref-paths add non-reference transcripts as embedded paths in the splice graph" << endl << " -u, --out-ref-paths output reference transcripts in GBWT, fasta and info" << endl << " -b, --write-gbwt FILE write transcripts as threads to GBWT index file" << endl + << " -g, --gbwt-bidirectional add transcripts as bidirectional threads to GBWT index" << endl << " -f, --write-fasta FILE write transcripts as sequences to fasta file" << endl << " -i, --write-info FILE write transcript origin info to tsv file" << endl << " -t, --threads INT number of compute threads to use [1]" << endl @@ -59,6 +60,7 @@ int32_t main_rna(int32_t argc, char** argv) { bool add_non_reference_transcript_paths = false; bool output_reference_transcript_paths = false; string gbwt_out_filename = ""; + bool gbwt_add_bidirectional = false; string fasta_out_filename = ""; string info_out_filename = ""; int32_t num_threads = 1; @@ -81,6 +83,7 @@ int32_t main_rna(int32_t argc, char** argv) { {"add-non-ref-paths", no_argument, 0, 'a'}, {"out-ref-paths", no_argument, 0, 'u'}, {"write-gbwt", no_argument, 0, 'b'}, + {"gbwt-bidirectional", no_argument, 0, 'g'}, {"write-fasta", no_argument, 0, 'f'}, {"write-info", no_argument, 0, 'i'}, {"threads", no_argument, 0, 't'}, @@ -90,7 +93,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t 
option_index = 0;

-        c = getopt_long(argc, argv, "n:s:l:ercdoraub:f:i:t:ph?", long_options, &option_index);
+        c = getopt_long(argc, argv, "n:s:l:ercdoraub:gf:i:t:ph?", long_options, &option_index);

         /* Detect the end of the options. */
         if (c == -1)
@@ -143,6 +146,10 @@ int32_t main_rna(int32_t argc, char** argv) {
             gbwt_out_filename = optarg;
             break;

+        case 'g':
+            gbwt_add_bidirectional = true;
+            break;
+
         case 'f':
             fasta_out_filename = optarg;
             break;
@@ -283,13 +290,13 @@ int32_t main_rna(int32_t argc, char** argv) {
     // Construct and write GBWT index of transcript paths in transcriptome.
     if (!gbwt_out_filename.empty()) {

-        if (show_progress) { cerr << "[vg rna] Writing transcripts as threads to GBWT index file ..." << endl; }
+        if (show_progress) { cerr << "[vg rna] Writing transcripts as " << ((gbwt_add_bidirectional) ? "bidirectional " : "") << "threads to GBWT index file ..." << endl; }

         // Silence GBWT index construction.
         gbwt::Verbosity::set(gbwt::Verbosity::SILENT);
         gbwt::GBWTBuilder gbwt_builder(gbwt::bit_length(gbwt::Node::encode(transcriptome.splice_graph().max_node_id(), true)));

-        transcriptome.construct_gbwt(&gbwt_builder, output_reference_transcript_paths);
+        transcriptome.construct_gbwt(&gbwt_builder, output_reference_transcript_paths, gbwt_add_bidirectional);

         // Finish construction and recode index.
         gbwt_builder.finish();
diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index bf2f1ef4b07..7c7e4866677 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -1109,7 +1109,7 @@ void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const
     }
 }

-void Transcriptome::construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts) const {
+void Transcriptome::construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts, const bool add_bidirectional) const {

     vector sample_names;
     sample_names.reserve(size());
@@ -1131,7 +1131,7 @@ void Transcriptome::construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool
         }

         // Insert transcript path as thread into GBWT index.
-        gbwt_builder->insert(gbwt_thread, false);
+        gbwt_builder->insert(gbwt_thread, add_bidirectional);

         // Insert transcript path name into GBWT index.
         gbwt_builder->index.metadata.addPath({static_cast(sample_names.size()), 0, 0, 0});
diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp
index 3abbd1210e9..2ace89c60af 100644
--- a/src/transcriptome.hpp
+++ b/src/transcriptome.hpp
@@ -125,7 +125,7 @@ class Transcriptome {
     void embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths, const bool rebuild_indexes);

     /// Add transcript paths as threads in GBWT index.
-    void construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts) const;
+    void construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts, const bool add_bidirectional) const;

    /// Writes transcript paths as alignments to a gam file.
void write_alignments(ostream * gam_ostream, const bool output_reference_transcripts) const;

From f17fce05ee96a2cf0bba1038546011631f1bd3a1 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 5 Nov 2019 13:45:48 -0500
Subject: [PATCH 24/79] refactor support computation apart from genotyping

---
 src/graph_caller.cpp         |  10 +-
 src/snarl_caller.cpp         | 339 +++--------------------------
 src/snarl_caller.hpp         |  79 +-------
 src/subcommand/call_main.cpp |   9 +-
 src/traversal_support.cpp    | 321 +++++++++++++++++++++++++++
 src/traversal_support.hpp    | 113 ++++++++++++
 6 files changed, 476 insertions(+), 395 deletions(-)
 create mode 100644 src/traversal_support.cpp
 create mode 100644 src/traversal_support.hpp

diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp
index 3da529f2d3f..4387ef0b63d 100644
--- a/src/graph_caller.cpp
+++ b/src/graph_caller.cpp
@@ -296,8 +296,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph,
                                                    0,
                                                    0,
                                                    get_path_index,
-                                                   [&](id_t id) { return snarl_caller.get_min_node_support(id);},
-                                                   [&](edge_t edge) { return snarl_caller.get_edge_support(edge);});
+                                                   [&](id_t id) { return snarl_caller.get_support_finder().get_min_node_support(id);},
+                                                   [&](edge_t edge) { return snarl_caller.get_support_finder().get_edge_support(edge);});
     } else {
         // our graph is not in vg format. we will make graphs for each site as needed and work with those

@@ -364,7 +364,7 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) {
        // determine the support threshold for the traversal finder. if we're using average
        // support, then we don't use any (set to 0), otherwise, use the minimum support for a call
        SupportBasedSnarlCaller& support_caller = dynamic_cast<SupportBasedSnarlCaller&>(snarl_caller);
-       size_t threshold = support_caller.get_average_traversal_support_switch_threshold();
+       size_t threshold = support_caller.get_support_finder().get_average_traversal_support_switch_threshold();
        double support_cutoff = total_snarl_length <= threshold ?
support_caller.get_min_total_support_for_call() : 0; rep_trav_finder = new RepresentativeTraversalFinder(vg_graph, snarl_manager, max_search_depth, @@ -373,10 +373,10 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { support_cutoff, support_cutoff, get_path_index, - [&](id_t id) { return support_caller.get_min_node_support(id);}, + [&](id_t id) { return support_caller.get_support_finder().get_min_node_support(id);}, // note: because our traversal finder and support caller have // different graphs, they can't share edge handles - [&](edge_t edge) { return support_caller.get_edge_support( + [&](edge_t edge) { return support_caller.get_support_finder().get_edge_support( vg_graph.get_id(edge.first), vg_graph.get_is_reverse(edge.first), vg_graph.get_id(edge.second), vg_graph.get_is_reverse(edge.second));}); diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index b323785e679..e13c66fb6e9 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -13,9 +13,11 @@ function SnarlCaller::get_skip_allele_fn() const { return [](const SnarlTraversal&) { return false; }; } -SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager) : +SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder) : graph(graph), - snarl_manager(snarl_manager) { + snarl_manager(snarl_manager), + support_finder(support_finder) { } SupportBasedSnarlCaller::~SupportBasedSnarlCaller() { @@ -57,10 +59,10 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, #endif // get the traversal sizes - vector traversal_sizes = get_traversal_sizes(traversals); + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = get_traversal_set_support(traversals, {}, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, ref_trav_idx); int best_allele = get_best_support(supports, {}); #ifdef debug @@ -75,7 +77,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = get_traversal_set_support(traversals, {best_allele}, true, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, ref_trav_idx); vector skips = {best_allele}; for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); @@ -88,7 +90,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, } } // get the supports of each traversal in light of best - vector secondary_supports = get_traversal_set_support(traversals, {best_allele}, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, ref_trav_idx); int second_best_allele = get_best_support(secondary_supports, {skips}); // get the supports of each traversal in light of second best @@ -97,7 +99,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, int third_best_allele = -1; if (second_best_allele != -1) { // prune out traversals whose exclusive support relative to second best doesn't pass cut - vector tertiary_exclusive_supports = get_traversal_set_support(traversals, 
{second_best_allele}, true, false, ref_trav_idx); + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, true, false, ref_trav_idx); skips.push_back(best_allele); skips.push_back(second_best_allele); for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { @@ -106,7 +108,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, skips.push_back(i); } } - tertiary_supports = get_traversal_set_support(traversals, {second_best_allele}, false, false, ref_trav_idx); + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, ref_trav_idx); third_best_allele = get_best_support(tertiary_supports, skips); } @@ -253,11 +255,11 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, shared_travs.push_back(genotype[0]); } // compute the support of our called alleles - vector allele_supports = get_traversal_set_support(traversals, shared_travs, false, false, 0); + vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, 0); // get the support of our uncalled alleles, making sure to not include any called support // TODO: handle shared support within this set - vector uncalled_supports = get_traversal_set_support(traversals, genotype, false, true, 0); + vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, 0); // Set up the depth format field variant.format.push_back("DP"); @@ -366,221 +368,6 @@ void SupportBasedSnarlCaller::update_vcf_header(string& header) const { std::to_string(min_site_depth) + "\">\n"; } -function SupportBasedSnarlCaller::get_skip_allele_fn() const { - // port over cutoff used in old support caller (there avg support used all the time, here - // we use the same toggles as when genotyping) - return [&](const SnarlTraversal& trav) -> bool { - return support_val(get_traversal_support(trav)) < min_alt_path_support; - }; -} - -int64_t SupportBasedSnarlCaller::get_edge_length(const edge_t& edge, const unordered_map& ref_offsets) const { - int len = -1; - // use our reference traversal to try to come up with a deletion length for our edge - // idea: if our edge corresponds to a huge deltion, it should be weighted accordingly - auto s_it = ref_offsets.find(graph.get_id(edge.first)); - auto e_it = ref_offsets.find(graph.get_id(edge.second)); - if (s_it != ref_offsets.end() && e_it != ref_offsets.end()) { - size_t start_offset = s_it->second; - if (!graph.get_is_reverse(edge.first)) { - start_offset += graph.get_length(edge.first); - } - size_t end_offset = e_it->second; - if (graph.get_is_reverse(edge.second)) { - end_offset += graph.get_length(edge.second); - } - if (start_offset > end_offset) { - std::swap(start_offset, end_offset); - } - len = end_offset - start_offset; - } - return std::max(len, 1); -} - -tuple SupportBasedSnarlCaller::get_child_support(const Snarl& snarl) const { - // port over old functionality from support caller - // todo: do we need to flag nodes as covered like it does? 
- pair, unordered_set > contents = snarl_manager.deep_contents(&snarl, graph, true); - Support child_max_support; - Support child_total_support; - size_t child_size = 0; - for (id_t node_id : contents.first) { - Support child_support = get_avg_node_support(node_id); - child_max_support = support_max(child_max_support, child_support); - child_size += graph.get_length(graph.get_handle(node_id)); - child_total_support += child_support; - } - Support child_avg_support = child_total_support / child_size; - // we always use child_max like the old support_caller. - // this is the only way to get top-down recursion to work in many cases - // todo: fix to use bottom up, get get support from actual traversals - // every time!! - return std::tie(child_max_support, child_max_support, child_size); -} - - -Support SupportBasedSnarlCaller::get_traversal_support(const SnarlTraversal& traversal) const { - return get_traversal_set_support({traversal}, {}, false, false).at(0); -} - -vector SupportBasedSnarlCaller::get_traversal_set_support(const vector& traversals, - const vector& shared_travs, - bool exclusive_only, - bool exclusive_count, - int ref_trav_idx) const { - - // pass 1: how many times have we seen a node or edge - unordered_map node_counts; - unordered_map edge_counts; - map child_counts; - - for (auto trav_idx : shared_travs) { - const SnarlTraversal& trav = traversals[trav_idx]; - for (int i = 0; i < trav.visit_size(); ++i) { - const Visit& visit = trav.visit(i); - if (visit.node_id() != 0) { - // Count the node once - if (node_counts.count(visit.node_id())) { - node_counts[visit.node_id()] += 1; - } else { - node_counts[visit.node_id()] = 1; - } - } else { - // Count the child once - if (child_counts.count(visit.snarl())) { - child_counts[visit.snarl()] += 1; - } else { - child_counts[visit.snarl()] = 1; - } - } - // note: there is no edge between adjacent snarls as they overlap - // on their endpoints. 
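// (Editorial note on the code being moved out of this file: the per-element
// counts collected in this pass feed the scaling rule in pass 2, where support
// on an element shared with share_count of the shared_travs is multiplied by
// 1 / (1 + share_count), or zeroed in the exclusive modes. For instance,
// assuming an edge with forward read support 10 that one already-called allele
// also uses:
//
//     int share_count = 1;
//     double scale_factor = 1.0 / (1.0 + share_count);  // 0.5
//     Support scaled = edge_support * scale_factor;     // contributes 5
//
// which is exactly how update_support applies it further down.)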
- if (i > 0 && (trav.visit(i - 1).node_id() != 0 || trav.visit(i).node_id() != 0)) { - edge_t edge = to_edge(graph, trav.visit(i - 1), visit); - // Count the edge once - if (edge_counts.count(edge)) { - edge_counts[edge] += 1; - } else { - edge_counts[edge] = 1; - } - } - } - } - - // pass 1.5: get index for looking up deletion edge lengths (so far we aren't dependent - // on having anything but a path handle graph, so we index on the fly) - unordered_map ref_offsets; - if (ref_trav_idx >= 0) { - ref_offsets = get_ref_offsets(traversals[ref_trav_idx]); - } - - // pass 2: get the supports - // we compute the various combinations of min/avg node/trav supports as we don't know which - // we will need until all the sizes are known - Support max_support; - max_support.set_forward(numeric_limits::max()); - vector min_supports_min(traversals.size(), max_support); // use min node support - vector min_supports_avg(traversals.size(), max_support); // use avg node support - vector has_support(traversals.size(), false); - vector tot_supports_min(traversals.size()); // weighted by lengths, using min node support - vector tot_supports_avg(traversals.size()); // weighted by lengths, using avg node support - vector tot_sizes(traversals.size(), 0); // to compute average from to_supports; - vector tot_sizes_all(traversals.size(), 0); // as above, but includes excluded lengths - int max_trav_size = 0; // size of longest traversal - - bool count_end_nodes = false; // toggle to include snarl ends - - auto update_support = [&] (int trav_idx, const Support& min_support, - const Support& avg_support, int length, int share_count) { - // keep track of overall size of longest traversal - tot_sizes_all[trav_idx] += length; - max_trav_size = std::max(tot_sizes_all[trav_idx], max_trav_size); - - // apply the scaling - double scale_factor = ((exclusive_only || exclusive_count) && share_count > 0) ? 0. : 1. / (1. 
+ share_count); - - // when looking at exclusive support, we don't normalize by skipped lengths - if (scale_factor != 0 || !exclusive_only || exclusive_count) { - has_support[trav_idx] = true; - Support scaled_support_min = min_support * scale_factor; - Support scaled_support_avg = avg_support * scale_factor; - - tot_supports_min[trav_idx] += scaled_support_min; - tot_supports_avg[trav_idx] += scaled_support_avg * length; - tot_sizes[trav_idx] += length; - min_supports_min[trav_idx] = support_min(min_supports_min[trav_idx], scaled_support_min); - min_supports_avg[trav_idx] = support_min(min_supports_avg[trav_idx], scaled_support_avg * length); - } - }; - - for (int trav_idx = 0; trav_idx < traversals.size(); ++trav_idx) { - const SnarlTraversal& trav = traversals[trav_idx]; - for (int visit_idx = 0; visit_idx < trav.visit_size(); ++visit_idx) { - const Visit& visit = trav.visit(visit_idx); - Support min_support; - Support avg_support; - int64_t length; - int share_count = 0; - - if (visit.node_id() != 0) { - // get the node support - min_support = get_min_node_support(visit.node_id()); - avg_support = get_avg_node_support(visit.node_id()); - length = graph.get_length(graph.get_handle(visit.node_id())); - if (node_counts.count(visit.node_id())) { - share_count = node_counts[visit.node_id()]; - } - } else { - // get the child support - tie(min_support, avg_support, length) = get_child_support(visit.snarl()); - if (child_counts.count(visit.snarl())) { - share_count = child_counts[visit.snarl()]; - } - } - if (count_end_nodes || (visit_idx > 0 && visit_idx < trav.visit_size() - 1)) { - update_support(trav_idx, min_support, avg_support, length, share_count); - } - share_count = 0; - - if (visit_idx > 0 && (trav.visit(visit_idx - 1).node_id() != 0 || trav.visit(visit_idx).node_id() != 0)) { - // get the edge support - edge_t edge = to_edge(graph, trav.visit(visit_idx - 1), visit); - min_support = get_edge_support(edge); - length = get_edge_length(edge, ref_offsets); - if (edge_counts.count(edge)) { - share_count = edge_counts[edge]; - } - update_support(trav_idx, min_support, min_support, length, share_count); - } - } - } - - // correct for case where no exclusive support found - for (int i = 0; i < min_supports_min.size(); ++i) { - if (!has_support[i]) { - min_supports_min[i] = Support(); - min_supports_avg[i] = Support(); - } - } - - bool use_avg_trav_support = max_trav_size >= average_traversal_support_switch_threshold; - bool use_avg_node_support = max_trav_size >= average_node_support_switch_threshold; - - if (use_avg_trav_support) { - vector& tot_supports = use_avg_node_support ? tot_supports_avg : tot_supports_min; - for (int i = 0; i < tot_supports.size(); ++i) { - if (tot_sizes[i] > 0) { - tot_supports[i] /= (double)tot_sizes[i]; - } else { - tot_supports[i] = Support(); - } - } - return tot_supports; - } else { - return use_avg_node_support ? 
min_supports_avg : min_supports_min; - } -} - int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { int best_allele = -1; for(size_t i = 0; i < supports.size(); i++) { @@ -592,35 +379,24 @@ int SupportBasedSnarlCaller::get_best_support(const vector& supports, c return best_allele; } -vector SupportBasedSnarlCaller::get_traversal_sizes(const vector& traversals) const { - vector sizes(traversals.size(), 0); - for (int i = 0; i < traversals.size(); ++i) { - for (int j = 0; j < traversals[i].visit_size(); ++j) { - if (traversals[i].visit(j).node_id() != 0) { - sizes[i] += graph.get_length(graph.get_handle(traversals[i].visit(j).node_id())); - } else { - // just summing up the snarl contents, which isn't a great heuristic but will - // help in some cases - pair, unordered_set > contents = snarl_manager.deep_contents( - snarl_manager.into_which_snarl(traversals[i].visit(j)), graph, true); - for (id_t node_id : contents.first) { - sizes[i] += graph.get_length(graph.get_handle(node_id)); - } - } - } - } - return sizes; - +function SupportBasedSnarlCaller::get_skip_allele_fn() const { + // port over cutoff used in old support caller (there avg support used all the time, here + // we use the same toggles as when genotyping) + return [&](const SnarlTraversal& trav) -> bool { + return support_val(support_finder.get_traversal_support(trav)) < min_alt_path_support; + }; } -size_t SupportBasedSnarlCaller::get_average_traversal_support_switch_threshold() const { - return average_traversal_support_switch_threshold; -} int SupportBasedSnarlCaller::get_min_total_support_for_call() const { return min_total_support_for_call; } +const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { + return support_finder; +} + + double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, int second_best_trav, int ref_trav_idx) const { bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || @@ -653,76 +429,7 @@ double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int return bias_limit; } -unordered_map SupportBasedSnarlCaller::get_ref_offsets(const SnarlTraversal& ref_trav) const { - unordered_map ref_offsets; - size_t offset = 0; - for (int i = 0; i < ref_trav.visit_size(); ++i) { - const Visit& visit = ref_trav.visit(i); - if (visit.node_id() != 0) { - if (visit.backward()) { - offset += graph.get_length(graph.get_handle(visit.node_id())); - ref_offsets[visit.node_id()] = offset; - } else { - ref_offsets[visit.node_id()] = offset; - offset += graph.get_length(graph.get_handle(visit.node_id())); - } - } - } - return ref_offsets; -} - -PackedSupportSnarlCaller::PackedSupportSnarlCaller(const Packer& packer, SnarlManager& snarl_manager) : - SupportBasedSnarlCaller(*dynamic_cast(packer.get_graph()), snarl_manager), - packer(packer) { -} - -PackedSupportSnarlCaller::~PackedSupportSnarlCaller() { -} -Support PackedSupportSnarlCaller::get_edge_support(const edge_t& edge) const { - return get_edge_support(graph.get_id(edge.first), graph.get_is_reverse(edge.first), - graph.get_id(edge.second), graph.get_is_reverse(edge.second)); -} - -Support PackedSupportSnarlCaller::get_edge_support(id_t from, bool from_reverse, - id_t to, bool to_reverse) const { - Edge proto_edge; - proto_edge.set_from(from); - proto_edge.set_from_start(from_reverse); - proto_edge.set_to(to); - proto_edge.set_to_end(to_reverse); - Support support; - support.set_forward(packer.edge_coverage(proto_edge)); - 
return support; -} - -Support PackedSupportSnarlCaller::get_min_node_support(id_t node) const { - Position pos; - pos.set_node_id(node); - size_t offset = packer.position_in_basis(pos); - size_t coverage = packer.coverage_at_position(offset); - size_t end_offset = offset + graph.get_length(graph.get_handle(node)); - for (int i = offset + 1; i < end_offset; ++i) { - coverage = min(coverage, packer.coverage_at_position(i)); - } - Support support; - support.set_forward(coverage); - return support; -} - -Support PackedSupportSnarlCaller::get_avg_node_support(id_t node) const { - Position pos; - pos.set_node_id(node); - size_t offset = packer.position_in_basis(pos); - size_t coverage = 0; - size_t length = graph.get_length(graph.get_handle(node)); - for (int i = 0; i < length; ++i) { - coverage += packer.coverage_at_position(offset + i); - } - Support support; - support.set_forward((double)coverage / (double)length); - return support; -} } diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 4c0edaf6e98..40b4bab6784 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -11,7 +11,7 @@ #include "handle.hpp" #include "snarls.hpp" #include "genotypekit.hpp" -#include "packer.hpp" +#include "traversal_support.hpp" namespace vg { @@ -51,29 +51,14 @@ class SnarlCaller { */ class SupportBasedSnarlCaller : public SnarlCaller { public: - SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager); + SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder); virtual ~SupportBasedSnarlCaller(); /// Set some of the parameters void set_het_bias(double het_bias, double ref_het_bias = 0.); void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); - /// Support of an edge - virtual Support get_edge_support(const edge_t& edge) const = 0; - virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const = 0; - - /// Effective length of an edge - virtual int64_t get_edge_length(const edge_t& edge, const unordered_map& ref_offsets) const; - - /// Minimum support of a node - virtual Support get_min_node_support(id_t node) const = 0; - - /// Average support of a node - virtual Support get_avg_node_support(id_t node) const = 0; - - /// Use node or edge support as proxy for child support (as was done in original calling code) - virtual tuple get_child_support(const Snarl& snarl) const; - /// Get the genotype of a site virtual vector genotype(const Snarl& snarl, const vector& traversals, @@ -93,31 +78,12 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Use min_alt_path_support threshold as cutoff virtual function get_skip_allele_fn() const; - /// Get the support of a traversal - /// Child snarls are handled as in the old call code: their maximum support is used - virtual Support get_traversal_support(const SnarlTraversal& traversal) const; - - /// Get the support of a set of traversals. Any support overlapping traversals in shared_travs - /// will have their support split. If exclusive_only is true, then any split support gets - /// rounded down to 0 (and ignored when computing mins or averages) . 
- /// exclusive_count is like exclusive only except shared traversals will be counted (as 0) - /// when doing average and min support - /// if the ref_trav_idx is given, it will be used for computing (deletion) edge lengths - virtual vector get_traversal_set_support(const vector& traversals, - const vector& shared_travs, - bool exclusive_only, - bool exclusive_count, - int ref_trav_idx = -1) const; - - /// Get the total length of all nodes in the traversal - virtual vector get_traversal_sizes(const vector& traversals) const; - - /// Get the average traversal support thresholdek - virtual size_t get_average_traversal_support_switch_threshold() const; - /// Get the minimum total support for call virtual int get_min_total_support_for_call() const; + /// Get the traversal support finder + const TraversalSupportFinder& get_support_finder() const; + protected: /// Get the best support out of a list of supports, ignoring skips @@ -162,12 +128,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { size_t min_site_depth = 3; /// what's the min log likelihood for allele depth assignments to PASS? double min_ad_log_likelihood_for_filter = -9; - /// Use average instead of minimum support when determining a traversal's support - /// its node and edge supports. - size_t average_traversal_support_switch_threshold = 50; - /// Use average instead of minimum support when determining a node's support - /// its position supports. - size_t average_node_support_switch_threshold = 50; /// used only for pruning alleles in the VCFTraversalFinder: minimum support /// of an allele's alt-path for it to be considered in the brute-force enumeration double min_alt_path_support = 0.2; @@ -176,35 +136,10 @@ class SupportBasedSnarlCaller : public SnarlCaller { SnarlManager& snarl_manager; - // todo: background support - + TraversalSupportFinder& support_finder; }; -/** - * Get the read support from a Packer object - */ -class PackedSupportSnarlCaller : public SupportBasedSnarlCaller { -public: - PackedSupportSnarlCaller(const Packer& packer, SnarlManager& snarl_manager); - virtual ~PackedSupportSnarlCaller(); - - /// Support of an edge - virtual Support get_edge_support(const edge_t& edge) const; - virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const; - - /// Minimum support of a node - virtual Support get_min_node_support(id_t node) const; - - /// Average support of a node - virtual Support get_avg_node_support(id_t node) const; - -protected: - - /// Derive supports from this pack index - const Packer& packer; -}; - // debug helpers inline string to_string(const HandleGraph& graph, handle_t handle) { return std::to_string(graph.get_id(handle)) + ":" + std::to_string(graph.get_is_reverse(handle)); diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 82df0928571..c2ebac17dda 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -248,11 +248,16 @@ int main_call(int argc, char** argv) { // Make a Packed Support Caller unique_ptr packer; + unique_ptr support_finder; if (!pack_filename.empty()) { // Load our packed supports (they must have come from vg pack on graph) packer = unique_ptr(new Packer(graph)); packer->load_from_file(pack_filename); - PackedSupportSnarlCaller* packed_caller = new PackedSupportSnarlCaller(*packer, *snarl_manager); + // Make a packed traversal support finder + PackedTraversalSupportFinder* packed_support_finder = new PackedTraversalSupportFinder(*packer, *snarl_manager); + support_finder = 
+        // Make a support caller
+        SupportBasedSnarlCaller* packed_caller = new SupportBasedSnarlCaller(*graph, *snarl_manager, *packed_support_finder);
         if (het_bias >= 0) {
             packed_caller->set_het_bias(het_bias, ref_het_bias);
         }
@@ -263,7 +268,7 @@ int main_call(int argc, char** argv) {
     }
 
     if (!snarl_caller) {
-        cerr << "error [vg call]: pack file (-p) is required" << endl;
+        cerr << "error [vg call]: pack file (-k) is required" << endl;
         return 1;
     }
 
diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp
new file mode 100644
index 00000000000..9372a995a73
--- /dev/null
+++ b/src/traversal_support.cpp
@@ -0,0 +1,321 @@
+#include "traversal_support.hpp"
+#include "genotypekit.hpp"
+
+//#define debug
+
+namespace vg {
+
+TraversalSupportFinder::TraversalSupportFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager) :
+    graph(graph),
+    snarl_manager(snarl_manager) {
+}
+
+TraversalSupportFinder::~TraversalSupportFinder() {
+
+}
+
+int64_t TraversalSupportFinder::get_edge_length(const edge_t& edge, const unordered_map<id_t, size_t>& ref_offsets) const {
+    int len = -1;
+    // use our reference traversal to try to come up with a deletion length for our edge
+    // idea: if our edge corresponds to a huge deletion, it should be weighted accordingly
+    auto s_it = ref_offsets.find(graph.get_id(edge.first));
+    auto e_it = ref_offsets.find(graph.get_id(edge.second));
+    if (s_it != ref_offsets.end() && e_it != ref_offsets.end()) {
+        size_t start_offset = s_it->second;
+        if (!graph.get_is_reverse(edge.first)) {
+            start_offset += graph.get_length(edge.first);
+        }
+        size_t end_offset = e_it->second;
+        if (graph.get_is_reverse(edge.second)) {
+            end_offset += graph.get_length(edge.second);
+        }
+        if (start_offset > end_offset) {
+            std::swap(start_offset, end_offset);
+        }
+        len = end_offset - start_offset;
+    }
+    return std::max(len, 1);
+}
+
+tuple<Support, Support, int> TraversalSupportFinder::get_child_support(const Snarl& snarl) const {
+    // port over old functionality from support caller
+    // todo: do we need to flag nodes as covered like it does?
+    pair<unordered_set<id_t>, unordered_set<edge_t> > contents = snarl_manager.deep_contents(&snarl, graph, true);
+    Support child_max_support;
+    Support child_total_support;
+    size_t child_size = 0;
+    for (id_t node_id : contents.first) {
+        Support child_support = get_avg_node_support(node_id);
+        child_max_support = support_max(child_max_support, child_support);
+        child_size += graph.get_length(graph.get_handle(node_id));
+        child_total_support += child_support;
+    }
+    Support child_avg_support = child_total_support / child_size;
+    // we always use child_max like the old support_caller.
+    // this is the only way to get top-down recursion to work in many cases
+    // todo: fix to use bottom up, get support from actual traversals
+    // every time!!
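+    // Illustration with made-up numbers: two child nodes with average supports
+    // 10 and 2 and lengths 8 and 2 give child_total_support = 12 and
+    // child_size = 10, so child_avg_support = 1.2 while child_max_support = 10;
+    // the tuple returned below then reports 10 for both the min and avg slots.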
+    return std::tie(child_max_support, child_max_support, child_size);
+}
+
+
+Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const {
+    return get_traversal_set_support({traversal}, {}, false, false).at(0);
+}
+
+vector<Support> TraversalSupportFinder::get_traversal_set_support(const vector<SnarlTraversal>& traversals,
+                                                                  const vector<int>& shared_travs,
+                                                                  bool exclusive_only,
+                                                                  bool exclusive_count,
+                                                                  int ref_trav_idx) const {
+
+    // pass 1: how many times have we seen a node or edge
+    unordered_map<id_t, int> node_counts;
+    unordered_map<edge_t, int> edge_counts;
+    map<Snarl, int> child_counts;
+
+    for (auto trav_idx : shared_travs) {
+        const SnarlTraversal& trav = traversals[trav_idx];
+        for (int i = 0; i < trav.visit_size(); ++i) {
+            const Visit& visit = trav.visit(i);
+            if (visit.node_id() != 0) {
+                // Count the node once
+                if (node_counts.count(visit.node_id())) {
+                    node_counts[visit.node_id()] += 1;
+                } else {
+                    node_counts[visit.node_id()] = 1;
+                }
+            } else {
+                // Count the child once
+                if (child_counts.count(visit.snarl())) {
+                    child_counts[visit.snarl()] += 1;
+                } else {
+                    child_counts[visit.snarl()] = 1;
+                }
+            }
+            // note: there is no edge between adjacent snarls as they overlap
+            // on their endpoints.
+            if (i > 0 && (trav.visit(i - 1).node_id() != 0 || trav.visit(i).node_id() != 0)) {
+                edge_t edge = to_edge(graph, trav.visit(i - 1), visit);
+                // Count the edge once
+                if (edge_counts.count(edge)) {
+                    edge_counts[edge] += 1;
+                } else {
+                    edge_counts[edge] = 1;
+                }
+            }
+        }
+    }
+
+    // pass 1.5: get index for looking up deletion edge lengths (so far we aren't dependent
+    // on having anything but a path handle graph, so we index on the fly)
+    unordered_map<id_t, size_t> ref_offsets;
+    if (ref_trav_idx >= 0) {
+        ref_offsets = get_ref_offsets(traversals[ref_trav_idx]);
+    }
+
+    // pass 2: get the supports
+    // we compute the various combinations of min/avg node/trav supports as we don't know which
+    // we will need until all the sizes are known
+    Support max_support;
+    max_support.set_forward(numeric_limits<int>::max());
+    vector<Support> min_supports_min(traversals.size(), max_support); // use min node support
+    vector<Support> min_supports_avg(traversals.size(), max_support); // use avg node support
+    vector<bool> has_support(traversals.size(), false);
+    vector<Support> tot_supports_min(traversals.size()); // weighted by lengths, using min node support
+    vector<Support> tot_supports_avg(traversals.size()); // weighted by lengths, using avg node support
+    vector<int> tot_sizes(traversals.size(), 0); // to compute average from tot_supports;
+    vector<int> tot_sizes_all(traversals.size(), 0); // as above, but includes excluded lengths
+    int max_trav_size = 0; // size of longest traversal
+
+    bool count_end_nodes = false; // toggle to include snarl ends
+
+    auto update_support = [&] (int trav_idx, const Support& min_support,
+                               const Support& avg_support, int length, int share_count) {
+        // keep track of overall size of longest traversal
+        tot_sizes_all[trav_idx] += length;
+        max_trav_size = std::max(tot_sizes_all[trav_idx], max_trav_size);
+
+        // apply the scaling
+        double scale_factor = ((exclusive_only || exclusive_count) && share_count > 0) ? 0. : 1. / (1. + share_count);
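+        // For example, a node shared with two other traversals (share_count == 2)
+        // keeps 1/3 of its support here; if exclusive_only or exclusive_count is
+        // set, that shared support is zeroed out instead.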
+
+        // when looking at exclusive support, we don't normalize by skipped lengths
+        if (scale_factor != 0 || !exclusive_only || exclusive_count) {
+            has_support[trav_idx] = true;
+            Support scaled_support_min = min_support * scale_factor;
+            Support scaled_support_avg = avg_support * scale_factor;
+
+            tot_supports_min[trav_idx] += scaled_support_min;
+            tot_supports_avg[trav_idx] += scaled_support_avg * length;
+            tot_sizes[trav_idx] += length;
+            min_supports_min[trav_idx] = support_min(min_supports_min[trav_idx], scaled_support_min);
+            min_supports_avg[trav_idx] = support_min(min_supports_avg[trav_idx], scaled_support_avg * length);
+        }
+    };
+
+    for (int trav_idx = 0; trav_idx < traversals.size(); ++trav_idx) {
+        const SnarlTraversal& trav = traversals[trav_idx];
+        for (int visit_idx = 0; visit_idx < trav.visit_size(); ++visit_idx) {
+            const Visit& visit = trav.visit(visit_idx);
+            Support min_support;
+            Support avg_support;
+            int64_t length;
+            int share_count = 0;
+
+            if (visit.node_id() != 0) {
+                // get the node support
+                min_support = get_min_node_support(visit.node_id());
+                avg_support = get_avg_node_support(visit.node_id());
+                length = graph.get_length(graph.get_handle(visit.node_id()));
+                if (node_counts.count(visit.node_id())) {
+                    share_count = node_counts[visit.node_id()];
+                }
+            } else {
+                // get the child support
+                tie(min_support, avg_support, length) = get_child_support(visit.snarl());
+                if (child_counts.count(visit.snarl())) {
+                    share_count = child_counts[visit.snarl()];
+                }
+            }
+            if (count_end_nodes || (visit_idx > 0 && visit_idx < trav.visit_size() - 1)) {
+                update_support(trav_idx, min_support, avg_support, length, share_count);
+            }
+            share_count = 0;
+
+            if (visit_idx > 0 && (trav.visit(visit_idx - 1).node_id() != 0 || trav.visit(visit_idx).node_id() != 0)) {
+                // get the edge support
+                edge_t edge = to_edge(graph, trav.visit(visit_idx - 1), visit);
+                min_support = get_edge_support(edge);
+                length = get_edge_length(edge, ref_offsets);
+                if (edge_counts.count(edge)) {
+                    share_count = edge_counts[edge];
+                }
+                update_support(trav_idx, min_support, min_support, length, share_count);
+            }
+        }
+    }
+
+    // correct for case where no exclusive support found
+    for (int i = 0; i < min_supports_min.size(); ++i) {
+        if (!has_support[i]) {
+            min_supports_min[i] = Support();
+            min_supports_avg[i] = Support();
+        }
+    }
+
+    bool use_avg_trav_support = max_trav_size >= average_traversal_support_switch_threshold;
+    bool use_avg_node_support = max_trav_size >= average_node_support_switch_threshold;
+
+    if (use_avg_trav_support) {
+        vector<Support>& tot_supports = use_avg_node_support ? tot_supports_avg : tot_supports_min;
+        for (int i = 0; i < tot_supports.size(); ++i) {
+            if (tot_sizes[i] > 0) {
+                tot_supports[i] /= (double)tot_sizes[i];
+            } else {
+                tot_supports[i] = Support();
+            }
+        }
+        return tot_supports;
+    } else {
+        return use_avg_node_support ? min_supports_avg : min_supports_min;
+    }
+}
+
+vector<int> TraversalSupportFinder::get_traversal_sizes(const vector<SnarlTraversal>& traversals) const {
+    vector<int> sizes(traversals.size(), 0);
+    for (int i = 0; i < traversals.size(); ++i) {
+        for (int j = 0; j < traversals[i].visit_size(); ++j) {
+            if (traversals[i].visit(j).node_id() != 0) {
+                sizes[i] += graph.get_length(graph.get_handle(traversals[i].visit(j).node_id()));
+            } else {
+                // just summing up the snarl contents, which isn't a great heuristic but will
+                // help in some cases
+                pair<unordered_set<id_t>, unordered_set<edge_t> > contents = snarl_manager.deep_contents(
+                    snarl_manager.into_which_snarl(traversals[i].visit(j)), graph, true);
+                for (id_t node_id : contents.first) {
+                    sizes[i] += graph.get_length(graph.get_handle(node_id));
+                }
+            }
+        }
+    }
+    return sizes;
+
+}
+
+size_t TraversalSupportFinder::get_average_traversal_support_switch_threshold() const {
+    return average_traversal_support_switch_threshold;
+}
+
+unordered_map<id_t, size_t> TraversalSupportFinder::get_ref_offsets(const SnarlTraversal& ref_trav) const {
+    unordered_map<id_t, size_t> ref_offsets;
+    size_t offset = 0;
+    for (int i = 0; i < ref_trav.visit_size(); ++i) {
+        const Visit& visit = ref_trav.visit(i);
+        if (visit.node_id() != 0) {
+            if (visit.backward()) {
+                offset += graph.get_length(graph.get_handle(visit.node_id()));
+                ref_offsets[visit.node_id()] = offset;
+            } else {
+                ref_offsets[visit.node_id()] = offset;
+                offset += graph.get_length(graph.get_handle(visit.node_id()));
+            }
+        }
+    }
+    return ref_offsets;
+}
+
+PackedTraversalSupportFinder::PackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager) :
+    TraversalSupportFinder(*dynamic_cast<const PathHandleGraph*>(packer.get_graph()), snarl_manager),
+    packer(packer) {
+}
+
+PackedTraversalSupportFinder::~PackedTraversalSupportFinder() {
+}
+
+Support PackedTraversalSupportFinder::get_edge_support(const edge_t& edge) const {
+    return get_edge_support(graph.get_id(edge.first), graph.get_is_reverse(edge.first),
+                            graph.get_id(edge.second), graph.get_is_reverse(edge.second));
+}
+
+Support PackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse,
+                                                       id_t to, bool to_reverse) const {
+    Edge proto_edge;
+    proto_edge.set_from(from);
+    proto_edge.set_from_start(from_reverse);
+    proto_edge.set_to(to);
+    proto_edge.set_to_end(to_reverse);
+    Support support;
+    support.set_forward(packer.edge_coverage(proto_edge));
+    return support;
+}
+
+Support PackedTraversalSupportFinder::get_min_node_support(id_t node) const {
+    Position pos;
+    pos.set_node_id(node);
+    size_t offset = packer.position_in_basis(pos);
+    size_t coverage = packer.coverage_at_position(offset);
+    size_t end_offset = offset + graph.get_length(graph.get_handle(node));
+    for (int i = offset + 1; i < end_offset; ++i) {
+        coverage = min(coverage, packer.coverage_at_position(i));
+    }
+    Support support;
+    support.set_forward(coverage);
+    return support;
+}
+
+Support PackedTraversalSupportFinder::get_avg_node_support(id_t node) const {
+    Position pos;
+    pos.set_node_id(node);
+    size_t offset = packer.position_in_basis(pos);
+    size_t coverage = 0;
+    size_t length = graph.get_length(graph.get_handle(node));
+    for (int i = 0; i < length; ++i) {
+        coverage += packer.coverage_at_position(offset + i);
+    }
+    Support support;
+    support.set_forward((double)coverage / (double)length);
+    return support;
+}
+
+
+}
diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp
new file mode 100644
index 00000000000..1b910ed3a3c
--- /dev/null
+++ b/src/traversal_support.hpp
@@ -0,0 +1,113 @@
+#ifndef VG_SUPPORT_FINDER_HPP_INCLUDED
+#define VG_SUPPORT_FINDER_HPP_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "handle.hpp"
+#include "snarls.hpp"
+#include "genotypekit.hpp"
+#include "packer.hpp"
+
+namespace vg {
+
+using namespace std;
+
+
+/**
+ * Get the read support of snarl traversals or sets of snarl traversals
+ */
+class TraversalSupportFinder {
+public:
+    TraversalSupportFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager);
+    virtual ~TraversalSupportFinder();
+
+    /// Support of an edge
+    virtual Support get_edge_support(const edge_t& edge) const = 0;
+    virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const = 0;
+
+    /// Effective length of an edge
+    virtual int64_t get_edge_length(const edge_t& edge, const unordered_map<id_t, size_t>& ref_offsets) const;
+
+    /// Minimum support of a node
+    virtual Support get_min_node_support(id_t node) const = 0;
+
+    /// Average support of a node
+    virtual Support get_avg_node_support(id_t node) const = 0;
+
+    /// Use node or edge support as proxy for child support (as was done in original calling code)
+    virtual tuple<Support, Support, int> get_child_support(const Snarl& snarl) const;
+
+    /// Get the support of a traversal
+    /// Child snarls are handled as in the old call code: their maximum support is used
+    virtual Support get_traversal_support(const SnarlTraversal& traversal) const;
+
+    /// Get the support of a set of traversals. Any support overlapping traversals in shared_travs
+    /// will have their support split. If exclusive_only is true, then any split support gets
+    /// rounded down to 0 (and ignored when computing mins or averages).
+    /// exclusive_count is like exclusive_only except shared traversals will be counted (as 0)
+    /// when doing average and min support
+    /// if the ref_trav_idx is given, it will be used for computing (deletion) edge lengths
+    virtual vector<Support> get_traversal_set_support(const vector<SnarlTraversal>& traversals,
+                                                      const vector<int>& shared_travs,
+                                                      bool exclusive_only,
+                                                      bool exclusive_count,
+                                                      int ref_trav_idx = -1) const;
+
+    /// Get the total length of all nodes in the traversal
+    virtual vector<int> get_traversal_sizes(const vector<SnarlTraversal>& traversals) const;
+
+    /// Get the average traversal support threshold
+    virtual size_t get_average_traversal_support_switch_threshold() const;
+
+    /// Relic from old code
+    static double support_val(const Support& support) { return total(support); };
+
+    /// get a map of the beginning of a node (in forward orientation) on a traversal
+    /// used for up-weighting large deletion edges in complex snarls with average support
+    unordered_map<id_t, size_t> get_ref_offsets(const SnarlTraversal& ref_trav) const;
+
+protected:
+
+    /// Use average instead of minimum support when determining a traversal's support
+    /// from its node and edge supports.
+    size_t average_traversal_support_switch_threshold = 50;
+    /// Use average instead of minimum support when determining a node's support
+    /// from its position supports.
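+    /// (e.g. with the default of 50, a site whose longest traversal is 60 bp
+    /// uses average rather than minimum node support; the traversal threshold
+    /// above is applied the same way in get_traversal_set_support)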
+    size_t average_node_support_switch_threshold = 50;
+
+    const PathHandleGraph& graph;
+
+    SnarlManager& snarl_manager;
+
+};
+
+/**
+ * Get the read support from a Packer object
+ */
+class PackedTraversalSupportFinder : public TraversalSupportFinder {
+public:
+    PackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager);
+    virtual ~PackedTraversalSupportFinder();
+
+    /// Support of an edge
+    virtual Support get_edge_support(const edge_t& edge) const;
+    virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const;
+
+    /// Minimum support of a node
+    virtual Support get_min_node_support(id_t node) const;
+
+    /// Average support of a node
+    virtual Support get_avg_node_support(id_t node) const;
+
+protected:
+
+    /// Derive supports from this pack index
+    const Packer& packer;
+};
+
+}
+
+#endif

From c4bdd4508879491ca56c724570a23cbf0cbf0c90 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 5 Nov 2019 17:36:25 -0500
Subject: [PATCH 25/79] move tmpfile init to packer construction to avoid race condition

---
 src/packer.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/packer.cpp b/src/packer.cpp
index 6f14e79dd46..c3182661776 100644
--- a/src/packer.cpp
+++ b/src/packer.cpp
@@ -68,7 +68,11 @@ Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins,
     if (bin_size) {
         n_bins = num_bases_dynamic / bin_size + 1;
     }
-    tmpfstream_locks = new std::mutex[n_bins];
+    if (record_edits) {
+        tmpfstream_locks = new std::mutex[n_bins];
+        // open tmpfile if needed
+        ensure_edit_tmpfiles_open();
+    }
 
     // speed up quality computation if necessary
     for (size_t i = 0; i < get_thread_count(); ++i) {
@@ -385,8 +389,6 @@ void Packer::add(const Alignment& aln, int min_mapq, int min_baseq , bool qual_a
     if (aln.mapping_quality() < min_mapq) {
         return;
     }
-    // open tmpfile if needed
-    ensure_edit_tmpfiles_open();
     // count the nodes, edges, and edits
     Mapping prev_mapping;
     bool has_prev_mapping = false;

From 781b38b1237c89c24c8509924b99e2b78df87c84 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Wed, 6 Nov 2019 09:01:14 -0500
Subject: [PATCH 26/79] forgotten init

---
 src/packer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/packer.cpp b/src/packer.cpp
index c3182661776..7b7055a2e2f 100644
--- a/src/packer.cpp
+++ b/src/packer.cpp
@@ -63,6 +63,7 @@ Packer::Packer(const HandleGraph* graph, size_t bin_size, size_t coverage_bins,
     // mutexes for coverage
     base_locks = new std::mutex[coverage_dynamic.size()];
     edge_locks = new std::mutex[edge_coverage_dynamic.size()];
+    tmpfstream_locks = nullptr;
 
     // count the bins if binning
     if (bin_size) {

From 3fcca3c26e70504a82e359dd7b1b94a521114702 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Wed, 6 Nov 2019 17:20:14 -0800
Subject: [PATCH 27/79] Made things compile

---
 src/minimizer_mapper.cpp        |    2 +-
 src/seed_clusterer.cpp          | 1053 +++++++++++++++++--------------
 src/seed_clusterer.hpp          |   58 +-
 src/subcommand/cluster_main.cpp |    3 +-
 src/unittest/seed_clusterer.cpp |  385 +++++------
 5 files changed, 816 insertions(+), 685 deletions(-)

diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp
index 5a028a137d1..8b11b1f461d 100644
--- a/src/minimizer_mapper.cpp
+++ b/src/minimizer_mapper.cpp
@@ -187,7 +187,7 @@ void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) {
     }
 
     // Cluster the seeds. Get sets of input seed indexes that go together.
- vector> clusters paired_clusters = clusterer.cluster_seeds(seeds, distance_limit); + vector> clusters = clusterer.cluster_seeds(seeds, distance_limit); if (track_provenance) { funnel.substage("score"); diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index c05d75efbd7..ede84c05f36 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -2,7 +2,7 @@ #include -//#define DEBUG +#define DEBUG_CLUSTER namespace vg { @@ -10,22 +10,22 @@ namespace vg { dist_index(dist_index){ }; - vector> cluster_seeds (vector seeds, int64_t read_distance_limit) const { + SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (vector seeds, int64_t read_distance_limit) const { vector> all_seeds; all_seeds.push_back(std::move(seeds)); - tuple>>,vector>> all_clusters = - cluster_seeds(all_seeds, distance_limit); + tuple>>,vector>> all_clusters = + cluster_seeds(all_seeds, read_distance_limit, 0); return std::get<0>(all_clusters)[0]; }; - tuple>>,vector>> SnarlSeedClusterer::cluster_seeds ( + tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( vector> all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit) const { /* Given a vector of seeds and a limit, find a clustering of seeds where * seeds that are closer than the limit cluster together. * Returns a vector of cluster assignments */ -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << endl << "New cluster calculation:" << endl; #endif if (fragment_distance_limit != 0 && @@ -45,7 +45,9 @@ cerr << endl << "New cluster calculation:" << endl; //This stores all the tree relationships and cluster information //for a single level of the snarl tree as it is being processed //It also keeps track of the parents of the current level - TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit); + size_t seed_count = 0; + for (auto& v : all_seeds) seed_count+= v.size(); + TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); //Populate tree_state.node_to_seeds (mapping each node to the seeds it //contains) and snarl_to_nodes_by_level @@ -85,23 +87,38 @@ cerr << endl << "New cluster calculation:" << endl; tree_state.chain_to_snarls.clear(); } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Found read clusters : " << endl; - for (auto group : tree_state.read_union_find.all_groups()){ - for (size_t c : group) { - cerr << tree_state.seeds->at(c) << " "; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << "\t read num " << read_num << ": " ; + for (auto group : tree_state.read_union_find[read_num].all_groups()){ + cerr << "\t\t"; + for (size_t c : group) { + cerr << tree_state.all_seeds->at(read_num)[c] << " "; + } + cerr << endl; + } + } + vector ordered_seeds; + for (size_t i = 0 ; i < tree_state.all_seeds->size() ; i++) { + for (auto x : tree_state.all_seeds->at(i)) { + ordered_seeds.push_back(x); } - cerr << endl; } cerr << "Found fragment clusters : " << endl; for (auto group : tree_state.fragment_union_find.all_groups()){ + cerr << "\t"; for (size_t c : group) { - cerr << tree_state.seeds->at(c) << " "; + cerr << ordered_seeds[c] << " "; } cerr << endl; } #endif - return make_tuple(tree_state.read_union_find.all_groups(), + vector>> read_clusters; + for (auto& uf : tree_state.read_union_find) { + read_clusters.emplace_back(uf.all_groups()); + } + return make_tuple(std::move(read_clusters), tree_state.fragment_union_find.all_groups()); }; @@ -116,27 +133,29 @@ cerr << endl << "New cluster calculation:" 
<< endl; snarl_to_nodes) const { // Assign each seed to a node. - tree_state.node_to_seeds.reserve(tree_state.all_seeds->size()); - for (size_t i = 0; i < tree_state.seeds->size(); i++) { - for (size_t j = 0 ; j < tree_state.all_seeds[i].size() ; j++) { - id_t id = get_id(tree_state.all_seeds->at(i)->at(j)); - tree_state.node_to_seeds.emplace_back(id, i, j); + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ + vector& seeds = tree_state.all_seeds->at(read_num); + for (size_t i = 0; i < seeds.size(); i++) { + id_t id = get_id(seeds.at(i)); + tree_state.node_to_seeds[read_num].emplace_back(id, i); //For each seed, assign it to a node and the node to a snarl } + std::sort(tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end()); } - std::sort(tree_state.node_to_seeds.begin(), tree_state.node_to_seeds.end()); // Assign each node to a snarl. id_t prev_node = -1; - for (auto mapping : tree_state.node_to_seeds) { - if (get<0>(mapping) == prev_node) { - continue; + for (auto& read_node :tree_state.node_to_seeds) { + for (auto& mapping : read_node) { + if (mapping.first == prev_node) { + continue; + } + prev_node = mapping.first; + size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); + size_t depth = dist_index.snarl_indexes[snarl_i].depth; + snarl_to_nodes[depth][snarl_i].emplace_back( + NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); } - prev_node = mapping.first; - size_t snarl_i = dist_index.getPrimaryAssignment(get<0>(mapping)); - size_t depth = dist_index.snarl_indexes[snarl_i].depth; - snarl_to_nodes[depth][snarl_i].emplace_back( - NetgraphNode(get<0>(mapping), NODE), NodeClusters()); } } @@ -153,7 +172,7 @@ cerr << endl << "New cluster calculation:" << endl; MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[snarl_i]; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "At depth " << depth << " snarl number " << snarl_i << " headed by " << snarl_index.id_in_parent << " with children " << endl; @@ -175,7 +194,7 @@ cerr << endl << "New cluster calculation:" << endl; chain_rank, make_pair(snarl_i, cluster_one_snarl(tree_state, snarl_i))); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i << " headed by " << snarl_index.id_in_parent << " as a child of chain number " << chain_assignment << " headed by " << snarl_index.parent_id << endl; @@ -187,7 +206,7 @@ cerr << endl << "New cluster calculation:" << endl; if (depth != 0 && snarl_index.parent_id != 0){ //If this has a parent, record it -#ifdef DEBUG +#ifdef DEBUG_CLUSTER assert(snarl_index.parent_id >= dist_index.min_node_id); assert(snarl_index.parent_id <= dist_index.max_node_id); #endif @@ -199,7 +218,7 @@ cerr << endl << "New cluster calculation:" << endl; NetgraphNode (snarl_i, SNARL), cluster_one_snarl(tree_state, snarl_i)); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i << " headed by " << snarl_index.id_in_parent << " as a child of snarl number " << parent_snarl_i @@ -225,7 +244,7 @@ cerr << endl << "New cluster calculation:" << endl; // Get the chain's number size_t chain_i = kv.first; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "At depth " << depth << " chain number " << chain_i << " with children " << endl; for (auto it2 : kv.second) { @@ -242,7 +261,7 @@ cerr << endl << "New cluster calculation:" << endl; // Find the node ID that heads the parent of that chain. 
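            // (The chain's clusters are then handed up to the snarl that contains
            // this chain, so the next, shallower level of the snarl tree can keep
            // combining them.)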
size_t parent_id = dist_index.chain_indexes[chain_i].parent_id; // It must be a legitimate node ID we cover. -#ifdef DEBUG +#ifdef DEBUG_CLUSTER assert(parent_id >= dist_index.min_node_id); assert(parent_id <= dist_index.max_node_id); #endif @@ -254,7 +273,7 @@ cerr << endl << "New cluster calculation:" << endl; tree_state.parent_snarl_to_nodes[parent_snarl_i].emplace_back( NetgraphNode (chain_i, CHAIN), std::move(chain_clusters)); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Recording chain number " << chain_i << " headed by " << dist_index.chain_indexes[chain_i].id_in_parent << " as a child of snarl number " << parent_snarl_i @@ -269,168 +288,216 @@ cerr << endl << "New cluster calculation:" << endl; SnarlSeedClusterer::NodeClusters SnarlSeedClusterer::cluster_one_node( TreeState& tree_state, id_t node_id, int64_t node_length) const { -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding clusters on node " << node_id << " which has length " << node_length << endl; #endif /*Find clusters of seeds in this node. - * Returns a hash_set of the union find group IDs of the new clusters, + * Result contains hash_set of the union find group IDs of the new clusters, * and the shortest distance from any seed to the left and right sides * of the node*/ - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds.begin(), - tree_state.node_to_seeds.end(), - std::tuple(node_id, 0, 0)); - //indices of union find group ids of clusters in this node - NodeClusters node_clusters; + NodeClusters node_clusters(tree_state.all_seeds->size()); if (tree_state.read_distance_limit > node_length) { //If the limit is greater than the node length, then all the //seeds on this node must be in the same cluster - size_t group_id = seed_range_start->second; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + if (tree_state.node_to_seeds[read_num].size() > 0) { + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + + size_t group_id = seed_range_start->second; + size_t fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; + + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + //For each seed on this node, add it to the cluster + //And find the shortest distance from any seed to both + //ends of the node + + pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) + : get_offset(seed) + 1; + int64_t dist_right = is_rev(seed) ? 
get_offset(seed) + 1 + : node_length - get_offset(seed); + + node_clusters.read_best_left[read_num] = min_not_minus_one(dist_left, + node_clusters.read_best_left[read_num]); + node_clusters.read_best_right[read_num] = min_not_minus_one(dist_right, + node_clusters.read_best_right[read_num]); + node_clusters.fragment_best_left = min_not_minus_one(dist_left, + node_clusters.fragment_best_left); + node_clusters.fragment_best_right = min_not_minus_one(dist_right, + node_clusters.fragment_best_right); + + tree_state.read_union_find[read_num].union_groups(group_id, iter->second); + if (tree_state.fragment_distance_limit != 0 ) { + tree_state.fragment_union_find.union_groups(fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); + } - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds.end() && iter->first == node_id; ++iter) { - //For each seed on this node, add it to the cluster - //And find the shortest distance from any seed to both - //ends of the node + } - pos_t seed = tree_state.all_seeds->at(std::get<1>(*iter)).at(std::get<2>(*iter)); - int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) - : get_offset(seed) + 1; - int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 - : node_length - get_offset(seed); + //Record the new cluster + group_id = tree_state.read_union_find[read_num].find_group(group_id); + tree_state.read_cluster_dists[read_num][group_id] = make_pair(node_clusters.read_best_left[read_num], + node_clusters.read_best_right[read_num]); + node_clusters.read_cluster_heads.emplace(read_num, group_id); - node_clusters.best_left = min_not_minus_one(dist_left, - node_clusters.best_left); - node_clusters.best_right = min_not_minus_one(dist_right, - node_clusters.best_right); + if (tree_state.fragment_distance_limit != 0) { + fragment_group_id = tree_state.fragment_union_find.find_group(fragment_group_id); + tree_state.fragment_cluster_dists[fragment_group_id] = make_pair(node_clusters.fragment_best_left, + node_clusters.fragment_best_right); + } +#ifdef DEBUG_CLUSTER + assert (group_id == tree_state.read_union_find[read_num].find_group(group_id)); + cerr << "Found single cluster on node " << node_id << "with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (pair c : node_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + //if (dists.first == node_clusters.read_best_left[read_num]) {got_all_left[read_num] = true;} + //if (dists.second == node_clusters.read_best_right[read_num]) {got_all_right[read_num] = true;} + cerr << "\t" << c.first << ":"<second); - if (tree_state.fragment_distance_limit != 0) { - tree_state.fragment_union_find.union_groups(group_id, iter->second); +#endif } - - } - - //Record the new cluster - group_id = tree_state.read_union_find.find_group(group_id); - tree_state.read_cluster_dists[group_id] = make_pair(node_clusters.best_left, - node_clusters.best_right); - 
node_clusters.cluster_heads.insert(group_id); -#ifdef DEBUG - assert (group_id == tree_state.read_union_find.find_group(group_id)); - cerr << "Found single cluster on node " << node_id << endl; - bool got_left = false; - bool got_right = false; - for (size_t c : node_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - assert(dists.first == -1 || dists.first >= node_clusters.best_left); - assert(dists.second == -1 || dists.second >= node_clusters.best_right); - if (dists.first == node_clusters.best_left) {got_left = true;} - if (dists.second == node_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " << - dists.second << endl; } - assert(got_left); - assert(got_right); -#endif return node_clusters; } - //Create a vector of seeds with their offsets - vector> seed_offsets; - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds.end() && iter->first == node_id; ++iter) { - //For each seed, find its offset - pos_t seed = tree_state.seeds->at(iter->second); - int64_t offset = is_rev(seed) ? node_length - get_offset(seed) - : get_offset(seed) + 1; - - node_clusters.best_left = min_not_minus_one(offset, - node_clusters.best_left); - node_clusters.best_right = min_not_minus_one(node_length-offset+1, - node_clusters.best_right); - - seed_offsets.emplace_back(iter->second, offset); + vector> seed_offsets; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + // for all seeds + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + //For each seed, find its offset + pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) + : get_offset(seed) + 1; + + node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); + node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); + node_clusters.read_best_left[read_num] = min_not_minus_one(offset, node_clusters.read_best_left[read_num]); + node_clusters.read_best_right[read_num] = min_not_minus_one(node_length-offset+1, node_clusters.read_best_right[read_num]); + + seed_offsets.emplace_back(read_num, iter->second, offset); + } } //Sort seeds by their position in the node std::sort(seed_offsets.begin(), seed_offsets.end(), [&](const auto a, const auto b) -> bool { - return a.second < b.second; + return std::get<2>(a) < std::get<2>(b); } ); - int64_t last_offset = 0; int64_t read_last_left = -1; - size_t read_last_cluster = seed_offsets[0].first; - int64_t fragment_last_left = -1; - size_t fragment_last_cluster = seed_offsets[0].first; - node_clusters.cluster_heads.insert(read_last_cluster); + vector read_last_offset (tree_state.all_seeds->size(), -1); + int64_t fragment_last_offset = -1; + size_t fragment_last_cluster = -1; + vector read_last_cluster (tree_state.all_seeds->size(), -1); - for ( pair s : seed_offsets) { + for ( tuple s : seed_offsets) { //For each seed, in order of position in the node, //see if it belongs to a new read/fragment cluster - if it is //close enough to the previous seed + size_t read_num = std::get<0>(s); - if (read_last_left != -1 && - abs(s.second - last_offset) <= tree_state.read_distance_limit) { + if (read_last_offset[read_num] != -1 && + abs(std::get<2>(s) - read_last_offset[read_num]) <= tree_state.read_distance_limit) { + //TODO: Need abs? //If this seed is in the same read cluster as the previous one, //union them - tree_state.read_union_find.union_groups(s.first, read_last_cluster); - read_last_cluster = tree_state.read_union_find.find_group(s.first); - tree_state.read_cluster_dists[read_last_cluster] = make_pair(read_last_left, node_length-s.second+1); + int64_t prev_dist_left = tree_state.read_cluster_dists[read_num][read_last_cluster[read_num]].first; + tree_state.read_union_find[read_num].union_groups(std::get<1>(s), read_last_cluster[read_num]); + read_last_cluster[read_num] = tree_state.read_union_find[read_num].find_group(std::get<1>(s)); + tree_state.read_cluster_dists[read_num][read_last_cluster[read_num]] = + make_pair(prev_dist_left,node_length-std::get<2>(s)+1); + read_last_offset[read_num] = std::get<2>(s); if (tree_state.fragment_distance_limit != 0) { //If we are also clustering paired end reads by fragment distance, //cluster these together - tree_state.fragment_union_find.union_groups(s.first, fragment_last_cluster); - fragment_last_cluster = tree_state.fragment_union_find.find_group(s.first); + int64_t prev_dist_left = tree_state.fragment_cluster_dists[fragment_last_cluster].first; + tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); + fragment_last_cluster = tree_state.fragment_union_find.find_group(std::get<1>(s)+tree_state.read_index_offsets[read_num]); + tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); + fragment_last_offset = std::get<2>(s); } } else { - //This becomes a new cluster - node_clusters.cluster_heads.insert(s.first); - read_last_cluster = s.first; - read_last_left = s.second; - tree_state.read_cluster_dists[s.first] = 
make_pair(read_last_left, node_length - s.second + 1); + //This becomes a new read cluster + if (read_last_cluster[read_num] != -1) { + node_clusters.read_cluster_heads.emplace(read_num, read_last_cluster[read_num]); + } + read_last_cluster[read_num] = std::get<1>(s); + read_last_offset[read_num] = std::get<2>(s); + tree_state.read_cluster_dists[read_num][read_last_cluster[read_num]] = + make_pair(read_last_offset[read_num], node_length - read_last_offset[read_num] + 1); if (tree_state.fragment_distance_limit != 0) { - if (read_last_left != -1 && - abs(s.second - last_offset) <= tree_state.fragment_distance_limit) { + if (fragment_last_offset != -1 && + abs(read_last_offset[read_num] - fragment_last_offset) <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster - tree_state.fragment_union_find.union_groups(s.first, fragment_last_cluster); - fragment_last_cluster = tree_state.fragment_union_find.find_group(s.first); + int64_t prev_dist_left = tree_state.fragment_cluster_dists[fragment_last_cluster].first; + tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); + fragment_last_cluster = tree_state.fragment_union_find.find_group(fragment_last_cluster); + tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); } else { //If this is a new fragment cluster as well - fragment_last_cluster = s.first; - fragment_last_left = s.second; + fragment_last_cluster = std::get<1>(s)+tree_state.read_index_offsets[read_num]; + fragment_last_offset = std::get<2>(s); + tree_state.fragment_cluster_dists[fragment_last_cluster] = + make_pair(fragment_last_offset, node_length-fragment_last_offset+1); } } } - last_offset = s.second; - + } + for (size_t i = 0 ; i < read_last_cluster.size() ; i++) { + node_clusters.read_cluster_heads.emplace(i, read_last_cluster[i]); } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER + cerr << "Found read clusters on node " << node_id << endl; bool got_left = false; bool got_right = false; - for (size_t c : node_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - assert(dists.first == -1 || dists.first >= node_clusters.best_left); - assert(dists.first == -1 || dists.second >= node_clusters.best_right); - if (dists.first == node_clusters.best_left) {got_left = true;} - if (dists.second == node_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " - << dists.second << endl; + + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + for (pair c : node_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":"< group_id : node_clusters.read_cluster_heads) { + assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); } #endif + return node_clusters; }; @@ -448,51 +515,48 @@ cerr << endl << "New cluster calculation:" << endl; 
MinimumDistanceIndex::ChainIndex& chain_index = dist_index.chain_indexes[ chain_index_i]; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding clusters on chain number " << chain_index_i << " headed by node " << chain_index.id_in_parent << endl; #endif auto combine_snarl_clusters = [&] (size_t& new_group, size_t& combined_group, size_t& fragment_combined_group, - vector& to_erase, int64_t dist, - pair& dists){ + vector>& to_erase, int64_t fragment_dist,int64_t read_dist, + pair& dists, size_t read_num){ //Helper function to combine clusters of the same snarl //Used when two clusters in the same snarl can be combined by //looping in the chain - if (dist <= tree_state.read_distance_limit) { + if (read_dist <= tree_state.read_distance_limit) { if (combined_group == -1) { combined_group = new_group; } else { //Union the two groups - combined_group = tree_state.read_union_find.find_group( - combined_group); - tree_state.read_union_find.union_groups(combined_group, - new_group); + combined_group = tree_state.read_union_find[read_num].find_group(combined_group); + tree_state.read_union_find[read_num].union_groups(combined_group, new_group); //Find the new distances of the combined groups pair& old_dists = - tree_state.read_cluster_dists[combined_group]; - size_t new_combined_group = - tree_state.read_union_find.find_group(new_group); + tree_state.read_cluster_dists[read_num][combined_group]; + size_t new_combined_group = tree_state.read_union_find[read_num].find_group(new_group); //Update which groups are being kept track of if (new_combined_group != new_group) { - to_erase.push_back(new_group); + to_erase.emplace_back(read_num, new_group); } if (new_combined_group != combined_group) { - to_erase.push_back(combined_group); + to_erase.emplace_back(read_num, combined_group); } combined_group = new_combined_group; dists = make_pair( min_not_minus_one(old_dists.first, dists.first), min_not_minus_one(old_dists.second, dists.second)); - tree_state.read_cluster_dists[new_group] = dists; - tree_state.read_cluster_dists[combined_group] = dists; -#ifdef DEBUG - cerr << " New dists: " - << tree_state.read_cluster_dists[combined_group].first << " " - << tree_state.read_cluster_dists[combined_group].second << endl; + tree_state.read_cluster_dists[read_num][new_group] = dists; + tree_state.read_cluster_dists[read_num][combined_group] = dists; +#ifdef DEBUG_CLUSTER + cerr << " New dists for read num " << read_num << ": " + << tree_state.read_cluster_dists[read_num][combined_group].first << " " + << tree_state.read_cluster_dists[read_num][combined_group].second << endl; #endif } @@ -500,12 +564,13 @@ cerr << endl << "New cluster calculation:" << endl; if (fragment_combined_group != -1) { //If we're keeping track of fragment clusters, union this tree_state.fragment_union_find.union_groups(fragment_combined_group, - new_group); + new_group + tree_state.read_index_offsets[read_num]); } - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + fragment_combined_group = tree_state.fragment_union_find.find_group( + new_group + tree_state.read_index_offsets[read_num]); } } else if (tree_state.fragment_distance_limit != 0 && - dist <= tree_state.fragment_distance_limit) { + fragment_dist <= tree_state.fragment_distance_limit) { //If these aren't in the same read cluster but are in //the same fragment cluster if (fragment_combined_group == -1) { @@ -519,7 +584,7 @@ cerr << endl << "New cluster calculation:" << endl; }; //The clusters of the chain that are built from the snarl clusters //This 
will get updated as we traverse through the snarls - NodeClusters chain_clusters; + NodeClusters chain_clusters(tree_state.all_seeds->size()); //The rank of the node at which the chain clusters reach // (the last snarl that was traversed) @@ -572,13 +637,17 @@ cerr << endl << "New cluster calculation:" << endl; make_pair(start_rank, false), last_len, start_length); offset = offset - last_len + start_length; - for (size_t i : chain_clusters.cluster_heads) { - tree_state.read_cluster_dists[i].second = - tree_state.read_cluster_dists[i].second == -1 - ? -1 : tree_state.read_cluster_dists[i].second + offset; + for (pair c : chain_clusters.read_cluster_heads) { + tree_state.read_cluster_dists[c.first][c.second].second = + tree_state.read_cluster_dists[c.first][c.second].second == -1 + ? -1 : tree_state.read_cluster_dists[c.first][c.second].second + offset; + } + chain_clusters.fragment_best_right = chain_clusters.fragment_best_right == -1 ? -1 + : chain_clusters.fragment_best_right + offset; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + chain_clusters.read_best_right[read_num] = chain_clusters.read_best_right[read_num] == -1 ? -1 + : chain_clusters.read_best_right[read_num] + offset; } - chain_clusters.best_right = chain_clusters.best_right == -1 ? -1 - : chain_clusters.best_right + offset; } last_rank = start_rank + 1; @@ -596,19 +665,19 @@ cerr << endl << "New cluster calculation:" << endl; int64_t loop_dist_start = chain_index.loop_rev[start_rank] - 1; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Looking at snarl rank " << start_rank << " representing " << snarl_index.id_in_parent << endl; - cerr << " Snarl distance limits: " << snarl_clusters.best_left - << " " << snarl_clusters.best_right << endl; + cerr << " Snarl fragment distance limits: " << snarl_clusters.fragment_best_left + << " " << snarl_clusters.fragment_best_right << endl; cerr << " Snarl clusters to add: " << endl; - for (size_t c : snarl_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - cerr << "\tleft: " << dists.first << " right : " << dists.second + for (pair c : snarl_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\tread " << c.first << ",cluster " << c.second << " left: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -617,16 +686,16 @@ cerr << endl << "New cluster calculation:" << endl; cerr << " Clusters on chain: " << endl; - cerr << " best left: " << chain_clusters.best_left << " best right: " - << chain_clusters.best_right << endl; - for (size_t c : chain_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; + cerr << " best left: " << chain_clusters.fragment_best_left << " best right: " + << chain_clusters.fragment_best_right << endl; + for (pair c : chain_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << 
tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -637,31 +706,35 @@ cerr << endl << "New cluster calculation:" << endl; //Need to remember this to check if snarl clusters overlap the old //best distance - int64_t old_chain_right = chain_clusters.best_right; + int64_t fragment_chain_right = chain_clusters.fragment_best_right; + vector read_chain_right = std::move(chain_clusters.read_best_right); - vector to_add;//new cluster group ids from snarl clusters - vector to_erase; //old cluster group ids + vector> to_add;//new cluster group ids from snarl clusters + vector> to_erase; //old cluster group ids //New cluster- there will be at most one new cluster to add - size_t combined_cluster = -1; + vector< size_t> combined_cluster (tree_state.all_seeds->size(), -1); size_t fragment_combined_cluster = -1; - int64_t combined_left = -1; int64_t combined_right = -1; + vector combined_left (tree_state.all_seeds->size(), -1); + vector combined_right (tree_state.all_seeds->size(), -1); //Combined snarl clusters by taking chain loop left/right - size_t snarl_cluster_left = -1; - size_t snarl_cluster_right = -1; + vector snarl_cluster_left (tree_state.all_seeds->size(),-1); + vector snarl_cluster_right (tree_state.all_seeds->size(), -1); size_t fragment_snarl_cluster_left = -1; size_t fragment_snarl_cluster_right = -1; - chain_clusters.best_left = -1; chain_clusters.best_right = -1; - for (size_t j : snarl_clusters.cluster_heads) { + chain_clusters.fragment_best_right = -1; + chain_clusters.read_best_right.assign(tree_state.all_seeds->size(), -1); + for (pair cluster_head : snarl_clusters.read_cluster_heads) { // For each of the clusters for the current snarl, // first check if it can be combined with any other // snarl clusters by taking loops in the chain, // then, find if it belongs to the new combined cluster // that includes chain clusters + size_t read_num = cluster_head.first; pair snarl_dists = - std::move(tree_state.read_cluster_dists[j]); + std::move(tree_state.read_cluster_dists[read_num][cluster_head.second]); if (loop_dist_start != -1) { //If there is a loop going out and back into the start of @@ -671,183 +744,177 @@ cerr << endl << "New cluster calculation:" << endl; //The distance to the right side of the snarl // that is found by taking the leftmost seed and // looping through the chain to the left - int64_t new_right = - snarl_dists.first == -1 || loop_dist_start == -1 + int64_t new_right = snarl_dists.first == -1 || loop_dist_start == -1 ? 
-1 - : snarl_dists.first + loop_dist_start - + snarl_length - start_length; - snarl_dists.second = min_not_minus_one(new_right, - snarl_dists.second); - snarl_clusters.best_right =min_not_minus_one(snarl_clusters.best_right, - new_right); -#ifdef DEBUG -cerr << " (Possibly) updating looping distance to right of snarl cluster " << j << ": " + : snarl_dists.first + loop_dist_start + snarl_length - start_length; + snarl_dists.second = min_not_minus_one(new_right, snarl_dists.second); + snarl_clusters.fragment_best_right = + min_not_minus_one(snarl_clusters.fragment_best_right, new_right); + snarl_clusters.read_best_right[read_num] = + min_not_minus_one(snarl_clusters.read_best_right[read_num], new_right); +#ifdef DEBUG_CLUSTER +cerr << " (Possibly) updating looping distance to right of snarl cluster " << read_num <<":" << cluster_head.second << ": " << new_right << " -> " << snarl_dists.second << endl; #endif - if (snarl_clusters.best_left != -1 && snarl_dists.first != -1 ) { + if (snarl_clusters.read_best_left[read_num] != -1 && snarl_dists.first != -1 ) { //If this cluster can be combined with another cluster //from the left -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the left " ; #endif - combine_snarl_clusters(j, snarl_cluster_left, fragment_snarl_cluster_left, - to_erase, snarl_clusters.best_left + snarl_dists.first - + loop_dist_start - start_length - 1, snarl_dists); + combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], fragment_snarl_cluster_left, + to_erase, snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1, + snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1, + snarl_dists, read_num); } } if (loop_dist_end != -1) { //If there is a loop to the right - int64_t new_left = - snarl_dists.second == -1 || loop_dist_end == -1 - ? -1 - : snarl_dists.second + loop_dist_end + snarl_length - - end_length; - if (snarl_dists.first == -1 || (new_left != -1 & - new_left < snarl_dists.first)){ + int64_t new_left = snarl_dists.second == -1 || loop_dist_end == -1 + ? 
-1 + : snarl_dists.second + loop_dist_end + snarl_length - end_length; + if (snarl_dists.first == -1 || (new_left != -1 & new_left < snarl_dists.first)){ //If this is an improvement, update distances snarl_dists.first = new_left; - snarl_clusters.best_left = min_not_minus_one(new_left, - snarl_clusters.best_left); + snarl_clusters.read_best_left[read_num] = + min_not_minus_one(new_left, snarl_clusters.read_best_left[read_num]); + snarl_clusters.fragment_best_left = min_not_minus_one(new_left, snarl_clusters.fragment_best_left); -#ifdef DEBUG -cerr << "Updating looping distance to left of snarl cluster" << j << ": " +#ifdef DEBUG_CLUSTER +cerr << "Updating looping distance to left of snarl cluster " << read_num << ":" << cluster_head.second << ": " << new_left << endl; #endif } - if (snarl_clusters.best_right != -1 && snarl_dists.second != -1 ) { + if (snarl_clusters.read_best_right[read_num] != -1 && snarl_dists.second != -1 ) { //If this cluster can be combined with another cluster //from the right -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the right" << endl; #endif - combine_snarl_clusters(j, snarl_cluster_right, + combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], fragment_snarl_cluster_right, to_erase, - snarl_clusters.best_right + snarl_dists.second - + loop_dist_end - end_length - 1, snarl_dists); + snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1, + snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1, + snarl_dists, read_num); } } //Now check if this snarl cluster can be combined with any //existing chain clusters - if (old_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first + old_chain_right - start_length-1 + if (read_chain_right[read_num] != -1 && snarl_dists.first != -1 && + snarl_dists.first + read_chain_right[read_num] - start_length-1 <= tree_state.read_distance_limit) { //If this snarl cluster's leftmost seed is close enough to //the rightmost seed in the chain (up to this point), then //this snarl cluster is in the combined cluster - if (combined_cluster == -1) { - combined_cluster = j; - combined_left = snarl_dists.first == -1 ? -1 : + if (combined_cluster[read_num] == -1) { + combined_cluster[read_num] = cluster_head.second; + combined_left[read_num] = snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left; - combined_right = snarl_dists.second; + combined_right[read_num] = snarl_dists.second; } else { //Cluster - tree_state.read_union_find.union_groups(combined_cluster, j); - size_t new_group = tree_state.read_union_find.find_group(j); - combined_cluster = new_group; - combined_left = min_not_minus_one(combined_left, - snarl_dists.first == -1 ? -1 : - snarl_dists.first + add_dist_left); - combined_right = min_not_minus_one(combined_right, - snarl_dists.second); + tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); + combined_cluster[read_num] = tree_state.read_union_find[read_num].find_group(cluster_head.second); + combined_left[read_num] = min_not_minus_one(combined_left[read_num], + snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left); + combined_right[read_num] = min_not_minus_one(combined_right[read_num],snarl_dists.second); } if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_cluster != -1) { //Also cluster by fragment - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, j); + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, + cluster_head.second+tree_state.read_index_offsets[read_num]); } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(j); + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } } else { //If the snarl cluster does not get combined with any of - //the existing chain clusters, then it becomes a new - //chain cluster - if (tree_state.fragment_distance_limit != 0 && - old_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first + old_chain_right - start_length-1 - <= tree_state.fragment_distance_limit) { - //If this is a new read cluster but the same fragment cluster - if (fragment_combined_cluster == -1 ) { - fragment_combined_cluster = j; - } else { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, j); - fragment_combined_cluster = tree_state.fragment_union_find.find_group(j); + //the existing chain clusters, then it becomes a new chain cluster + if (tree_state.fragment_distance_limit != 0 && fragment_chain_right != -1 && snarl_dists.first != -1 && + snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.read_distance_limit) { + //Cluster in the same fragment but not the same read + if (fragment_combined_cluster != -1) { + //Also cluster by fragment + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, + cluster_head.second+tree_state.read_index_offsets[read_num]); } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } - to_add.push_back(j); + to_add.push_back(cluster_head); //Update its distances to the correct nodes in the chain - pair d = make_pair(snarl_dists.first == -1 - ? -1 : snarl_dists.first + add_dist_left, + pair d = make_pair(snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left, snarl_dists.second); - chain_clusters.best_left = min_not_minus_one(chain_clusters.best_left, + chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left,d.first); + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right,d.second); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], d.first); - chain_clusters.best_right = min_not_minus_one(chain_clusters.best_right, + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], d.second); - tree_state.read_cluster_dists[j] = std::move(d); + tree_state.read_cluster_dists[read_num][cluster_head.second] = std::move(d); } } //Next, go through each of the clusters of the chain and decide //if they get combined with snarl clusters - for (size_t i : chain_clusters.cluster_heads) { + for (pair cluster_head : chain_clusters.read_cluster_heads) { //For each old chain cluster - pair& chain_dists = tree_state.read_cluster_dists[i]; + size_t read_num = cluster_head.first; + pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; - if (snarl_clusters.best_left != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.best_left + if (snarl_clusters.read_best_left[read_num] != -1 && chain_dists.second != -1 + && chain_dists.second + snarl_clusters.read_best_left[read_num] - start_length-1 <= tree_state.read_distance_limit){ //If this chain cluster's rightmost seed is close enough //to the leftmost seed of any cluster in the snarl, then //this chain cluster is in the combined cluster - if (combined_cluster == -1) { - combined_cluster = i; - combined_left = chain_dists.first; - combined_right = chain_dists.second + dist_to_end; + if (combined_cluster[read_num] == -1) { + //New chain cluster + combined_cluster[read_num] = cluster_head.second; + combined_left[read_num] = chain_dists.first; + combined_right[read_num] = chain_dists.second + dist_to_end; } else { - tree_state.read_union_find.union_groups(combined_cluster, i); - size_t new_group = tree_state.read_union_find.find_group(i); - if (new_group == i) { - to_erase.push_back(combined_cluster); + //Combine + tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); + if (new_group == cluster_head.second) { + to_erase.emplace_back(read_num,combined_cluster[read_num]); } else { - to_erase.push_back(i); + to_erase.push_back(cluster_head); } - combined_cluster = new_group; - combined_left = min_not_minus_one(combined_left, - chain_dists.first); - combined_right = min_not_minus_one(combined_right, - chain_dists.second + dist_to_end); + combined_cluster[read_num] = new_group; + combined_left[read_num] = min_not_minus_one(combined_left[read_num], chain_dists.first); + combined_right[read_num] = min_not_minus_one(combined_right[read_num], chain_dists.second + dist_to_end); } if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_cluster != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + fragment_combined_cluster = 
tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } } else { //If this chain cluster is on its own, extend its right //distance to the end of the current snarl if (tree_state.fragment_distance_limit != 0 && - snarl_clusters.best_left != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.best_left + snarl_clusters.fragment_best_left != -1 && chain_dists.second != -1 + && chain_dists.second + snarl_clusters.fragment_best_left - start_length-1 <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster - if (fragment_combined_cluster == -1 ) { - fragment_combined_cluster = i; - } else { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + if (fragment_combined_cluster != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } chain_dists.second += dist_to_end; if ((tree_state.fragment_distance_limit == 0 && @@ -860,49 +927,51 @@ cerr << " Combining this cluster from the right" << endl; //either end of the chain is greater than the distance //limit, then it cannot cluster with anything else //so we can stop keeping track of it -#ifdef DEBUG - cerr << "Removing cluster " << i << endl; +#ifdef DEBUG_CLUSTER + cerr << "Removing cluster " << cluster_head.first << ":" << cluster_head.second << endl; #endif - to_erase.push_back(i); + to_erase.push_back(cluster_head); } else { - chain_clusters.best_left = min_not_minus_one( - chain_clusters.best_left, chain_dists.first); - chain_clusters.best_right = - min_not_minus_one(chain_clusters.best_right, - chain_dists.second); + chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left, chain_dists.first); + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right, chain_dists.second); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], chain_dists.first); + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], chain_dists.second); } } } //Update the chain cluster heads - for (size_t j : to_add) { - chain_clusters.cluster_heads.insert(j); + for (auto c : to_add) { + chain_clusters.read_cluster_heads.insert(c); } - for (size_t j : to_erase) { - chain_clusters.cluster_heads.erase(j); + for (auto c : to_erase) { + chain_clusters.read_cluster_heads.erase(c); } - if (combined_cluster != -1 ) { - chain_clusters.cluster_heads.insert(combined_cluster); - tree_state.read_cluster_dists[combined_cluster] = - make_pair(combined_left, combined_right); - chain_clusters.best_left = min_not_minus_one(chain_clusters.best_left, - combined_left); - chain_clusters.best_right = min_not_minus_one(chain_clusters.best_right, - combined_right); + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + if (combined_cluster[read_num] != -1 ) { + chain_clusters.read_cluster_heads.emplace(read_num, combined_cluster[read_num]); + tree_state.read_cluster_dists[read_num][combined_cluster[read_num]] = + make_pair(combined_left[read_num], combined_right[read_num]); + chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left, 
combined_left[read_num]); + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right, combined_right[read_num]); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], combined_left[read_num]); + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], combined_right[read_num]); + + } } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "\t finished with snarl " << snarl_index.id_in_parent - << "with best distances " << chain_clusters.best_left - << " " << chain_clusters.best_right + << "with best distances " << chain_clusters.fragment_best_left + << " " << chain_clusters.fragment_best_right << ", clusters:" < dists = tree_state.read_cluster_dists[c]; + for (pair c : chain_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -916,17 +985,17 @@ cerr << " Combining this cluster from the right" << endl; if (last_rank != chain_index.prefix_sum.size() - 2) { //If the last snarl we traversed was not the end of the chain, //Extend the right bound of each cluster to the end of the chain - chain_clusters.best_right = -1; - int64_t last_dist = last_rank == 0 ? 0 : - chain_index.prefix_sum[last_rank] - 1; - int64_t dist_to_end = chain_index.chainLength() - - last_dist - last_len; - for (size_t i : chain_clusters.cluster_heads) { - int64_t d = tree_state.read_cluster_dists[i].second; - tree_state.read_cluster_dists[i].second = d == -1 ? -1 - : d + dist_to_end; - chain_clusters.best_right = min_not_minus_one(chain_clusters.best_right, - tree_state.read_cluster_dists[i].second); + chain_clusters.fragment_best_right = -1; + chain_clusters.read_best_right.assign(tree_state.all_seeds->size(), -1); + int64_t last_dist = last_rank == 0 ? 0 : chain_index.prefix_sum[last_rank] - 1; + int64_t dist_to_end = chain_index.chainLength() - last_dist - last_len; + for (pair cluster_head : chain_clusters.read_cluster_heads) { + int64_t d = tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second; + tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second = d == -1 ? 
-1: d + dist_to_end; + chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right, + tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second); + chain_clusters.read_best_right[cluster_head.first] = min_not_minus_one(chain_clusters.read_best_right[cluster_head.first], + tree_state.read_cluster_dists[cluster_head.first][cluster_head.second].second); } } @@ -936,98 +1005,100 @@ cerr << " Combining this cluster from the right" << endl; //looping around the chain // int64_t first_length = chain_index.prefix_sum[0]-1; - vector to_erase; //old cluster group ids + vector> to_erase; //old cluster group ids //New cluster- there will be at most one new cluster to add - size_t combined_cluster = -1; + vector combined_cluster (tree_state.all_seeds->size(), -1); size_t fragment_combined_cluster = -1; - for (size_t i : chain_clusters.cluster_heads) { + for (pair cluster_head : chain_clusters.read_cluster_heads) { //For each chain cluster - pair& chain_dists = tree_state.read_cluster_dists[i]; + size_t read_num = cluster_head.first; + pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; - if ((chain_dists.second != -1 && chain_clusters.best_left != -1 && - chain_dists.second + chain_clusters.best_left - first_length - 1 + if ((chain_dists.second != -1 && chain_clusters.read_best_left[read_num] != -1 && + chain_dists.second + chain_clusters.read_best_left[read_num] - first_length - 1 <= tree_state.read_distance_limit) || - (chain_dists.first != -1 && chain_clusters.best_right != -1 && - chain_dists.first + chain_clusters.best_right - first_length - 1 + (chain_dists.first != -1 && chain_clusters.read_best_right[read_num] != -1 && + chain_dists.first + chain_clusters.read_best_right[read_num] - first_length - 1 <= tree_state.read_distance_limit)){ //If this chain cluster is in the combined cluster - if (combined_cluster == -1) { - combined_cluster = i; + if (combined_cluster[read_num] == -1) { + combined_cluster[read_num] = cluster_head.second; } else { - tree_state.read_union_find.union_groups(combined_cluster, i); + tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); if (tree_state.fragment_distance_limit != 0) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second + tree_state.all_seeds->size()); } - size_t new_group = tree_state.read_union_find.find_group(i); - if (new_group == i) { - to_erase.push_back(combined_cluster); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); + if (new_group == cluster_head.second) { + to_erase.emplace_back(read_num, combined_cluster[read_num]); } else { - to_erase.push_back(i); + to_erase.emplace_back(read_num, cluster_head.second); } - combined_cluster = new_group; + combined_cluster[read_num] = new_group; } if (tree_state.fragment_distance_limit != 0) { - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second + tree_state.all_seeds->size()); } } else if (tree_state.fragment_distance_limit != 0 && - ((chain_dists.second != -1 && chain_clusters.best_left != -1 && - chain_dists.second + chain_clusters.best_left - first_length - 1 + ((chain_dists.second != -1 && chain_clusters.fragment_best_left != -1 && + chain_dists.second + chain_clusters.fragment_best_left - first_length - 1 <= 
tree_state.fragment_distance_limit) || - (chain_dists.first != -1 && chain_clusters.best_right != -1 && - chain_dists.first + chain_clusters.best_right - first_length - 1 + (chain_dists.first != -1 && chain_clusters.fragment_best_right != -1 && + chain_dists.first + chain_clusters.fragment_best_right - first_length - 1 <= tree_state.fragment_distance_limit))){ //If we can cluster by fragment - if (fragment_combined_cluster == -1 ) { - fragment_combined_cluster = i; - } else { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, i); - fragment_combined_cluster = tree_state.fragment_union_find.find_group(i); + if (fragment_combined_cluster != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); + } } - for (size_t i : to_erase) { - chain_clusters.cluster_heads.erase(i); + for (auto c : to_erase) { + chain_clusters.read_cluster_heads.erase(c); } //Don't need to update best left and right distances because //a looping chain will be the top level chain } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Found clusters on chain " << chain_index.id_in_parent << endl; - cerr << "best left : " << chain_clusters.best_left << " best right : " - << chain_clusters.best_right << endl; - for (size_t c : chain_clusters.cluster_heads) { + cerr << "best left : " << chain_clusters.fragment_best_left << " best right : " + << chain_clusters.fragment_best_right << endl; + for (pair c : chain_clusters.read_cluster_heads) { cerr << "\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; } bool got_left = false; bool got_right = false; - for (size_t c : chain_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; + for (pair c : chain_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; if (!chain_index.is_looping_chain){ - assert(dists.first == -1 || dists.first >= chain_clusters.best_left); - assert(dists.second == -1 || dists.second >= chain_clusters.best_right); + assert(dists.first == -1 || dists.first >= chain_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= chain_clusters.fragment_best_right); + assert(dists.first == -1 || dists.first >= chain_clusters.read_best_left[c.first]); + assert(dists.second == -1 || dists.second >= chain_clusters.read_best_right[c.first]); } - if (dists.first == chain_clusters.best_left) {got_left = true;} - if (dists.second == chain_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " + if (dists.first == chain_clusters.fragment_best_left) {got_left = true;} + if (dists.second == chain_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " << dists.second << endl; } if (!chain_index.is_looping_chain) { assert(got_left); assert(got_right); } - for (size_t group_id : chain_clusters.cluster_heads) { + for (pair group_id : chain_clusters.read_cluster_heads) { - assert (group_id == 
tree_state.read_union_find.find_group(group_id)); + assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); } #endif @@ -1042,66 +1113,69 @@ cerr << " Combining this cluster from the right" << endl; * Nodes have not yet been clustered */ MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[snarl_index_i]; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding clusters on snarl number " << snarl_index_i << " headed by node " << snarl_index.id_in_parent << endl; #endif //Keep track of all clusters on this snarl - NodeClusters snarl_clusters; + NodeClusters snarl_clusters(tree_state.all_seeds->size()); auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, - size_t& fragment_combined_group, int64_t dist, - pair& dists){ + size_t& fragment_combined_group, int64_t read_dist, + int64_t fragment_dist, + pair& end_dists, size_t read_num){ //Helper function to compare and combine clusters in two nodes of the same snarl - //If the distance (dist) between two clusters is small enough, then combine them + //If the distance between two clusters is small enough, then combine them //for the read clusters and, if applicable, for the fragment clusters //Updates the distances stored for the read clusters - if (dist <= tree_state.read_distance_limit) { - //If the clusters are close enough to combine the reads + if (read_dist <= tree_state.read_distance_limit) { + //If the clusters are close enough to combine in the read if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_group != -1) { //Also combine fragment clusters - tree_state.fragment_union_find.union_groups(new_group, fragment_combined_group); + tree_state.fragment_union_find.union_groups(new_group+tree_state.read_index_offsets[read_num], + fragment_combined_group); } - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + fragment_combined_group = tree_state.fragment_union_find.find_group(new_group+tree_state.read_index_offsets[read_num]); } if (combined_group == -1) { - snarl_clusters.cluster_heads.insert(new_group); - tree_state.read_cluster_dists[new_group] = dists; + snarl_clusters.read_cluster_heads.emplace(read_num,new_group); + tree_state.read_cluster_dists[read_num][new_group] = end_dists; combined_group = new_group; } else { - //Combine the clusters + //Combine the clusters within the same read - combined_group = tree_state.read_union_find.find_group(combined_group); - pairold_dists = tree_state.read_cluster_dists[combined_group]; - tree_state.read_union_find.union_groups(new_group, combined_group); + combined_group = tree_state.read_union_find[read_num].find_group(combined_group); + pairold_dists = tree_state.read_cluster_dists[read_num][combined_group]; + tree_state.read_union_find[read_num].union_groups(new_group, combined_group); //Update distances and cluster head of new cluster - size_t new_g = tree_state.read_union_find.find_group(new_group); + size_t new_g = tree_state.read_union_find[read_num].find_group(new_group); if (new_g != new_group) { - snarl_clusters.cluster_heads.erase(new_group); + snarl_clusters.read_cluster_heads.erase(make_pair(read_num,new_group)); } if (new_g != combined_group) { - snarl_clusters.cluster_heads.erase(combined_group); + snarl_clusters.read_cluster_heads.erase(make_pair(read_num,combined_group)); } - snarl_clusters.cluster_heads.insert(new_g); - dists = make_pair( - min_not_minus_one(dists.first, old_dists.first), - min_not_minus_one(dists.second, old_dists.second)); - 
tree_state.read_cluster_dists[new_g] = dists; + snarl_clusters.read_cluster_heads.emplace(read_num,new_g); + end_dists = make_pair( + min_not_minus_one(end_dists.first, old_dists.first), + min_not_minus_one(end_dists.second, old_dists.second)); + tree_state.read_cluster_dists[read_num][new_g] = end_dists; new_group = new_g; combined_group = new_g; } } else if (tree_state.fragment_distance_limit != 0 - && dist <= tree_state.fragment_distance_limit) { + && fragment_dist <= tree_state.fragment_distance_limit) { //Same fragment if (fragment_combined_group == -1) { fragment_combined_group = new_group; } else { - tree_state.fragment_union_find.union_groups(new_group, fragment_combined_group); + tree_state.fragment_union_find.union_groups( + new_group + tree_state.read_index_offsets[read_num], fragment_combined_group); fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); } } @@ -1118,7 +1192,7 @@ cerr << " Combining this cluster from the right" << endl; //Maps each cluster of child nodes to its left and right distances //of the node its on - hash_map> old_dists; + hash_map, pair> old_dists; for (size_t i = 0; i < child_nodes.size() ; i++) { //Go through each child node of the netgraph and get clusters @@ -1147,7 +1221,7 @@ cerr << " Combining this cluster from the right" << endl; //Represents all the clusters on this child node NodeClusters& curr_child_clusters = child_nodes[i].second; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Finding distances to parent snarl " << snarl_index_i << " ends from child " << i << "/" << child_nodes.size() << endl; cerr << "Child is " << typeToString(child.node_type) << " number " @@ -1155,13 +1229,13 @@ cerr << " Combining this cluster from the right" << endl; cerr << "Node rank is " << node_rank << " fwd, " << rev_rank << " rev of " << snarl_index.num_nodes * 2 << endl; cerr << "Clusters at this child:" << endl; - for (size_t c : child_nodes[i].second.cluster_heads) { - cerr << "\tdist left: " << tree_state.read_cluster_dists[c].first - << " dist right: " << tree_state.read_cluster_dists[c].second << endl; + for (pair c : child_nodes[i].second.read_cluster_heads) { + cerr << "\tdist left: " << tree_state.read_cluster_dists[c.first][c.second].first + << " dist right: " << tree_state.read_cluster_dists[c.first][c.second].second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; @@ -1171,33 +1245,38 @@ cerr << " Combining this cluster from the right" << endl; assert(node_rank != numeric_limits::max()); #endif - vector children_i( - make_move_iterator(curr_child_clusters.cluster_heads.begin()), - make_move_iterator(curr_child_clusters.cluster_heads.end())); + vector> children_i( + make_move_iterator(curr_child_clusters.read_cluster_heads.begin()), + make_move_iterator(curr_child_clusters.read_cluster_heads.end())); for (size_t c_i = 0 ; c_i < children_i.size() ; c_i ++) { //for each cluster of child node i, find the distances to the //ends of the snarl - size_t c = children_i[c_i]; + pair child_cluster_head = children_i[c_i]; - pair dists_c= tree_state.read_cluster_dists[c]; - old_dists[c] = dists_c; + pair dists_c = 
tree_state.read_cluster_dists[child_cluster_head.first][child_cluster_head.second]; + old_dists[child_cluster_head] = dists_c; + //TODO: Do this only once pair new_dists = snarl_index.distToEnds(node_rank, dists_c.first,dists_c.second); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_parent << " : " << new_dists.first << " " << new_dists.second << endl; #endif - snarl_clusters.best_left =min_not_minus_one(snarl_clusters.best_left, - new_dists.first); - snarl_clusters.best_right = min_not_minus_one( - snarl_clusters.best_right, new_dists.second); + snarl_clusters.fragment_best_left =min_not_minus_one( + snarl_clusters.fragment_best_left,new_dists.first); + snarl_clusters.fragment_best_right = min_not_minus_one( + snarl_clusters.fragment_best_right, new_dists.second); + snarl_clusters.read_best_left[child_cluster_head.first] =min_not_minus_one( + snarl_clusters.read_best_left[child_cluster_head.first], new_dists.first); + snarl_clusters.read_best_right[child_cluster_head.first] = min_not_minus_one( + snarl_clusters.read_best_right[child_cluster_head.first], new_dists.second); - snarl_clusters.cluster_heads.insert(c); - tree_state.read_cluster_dists[c] = new_dists; + snarl_clusters.read_cluster_heads.insert(child_cluster_head); + tree_state.read_cluster_dists[child_cluster_head.first][child_cluster_head.second] = new_dists; } @@ -1209,19 +1288,19 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa NodeClusters& other_node_clusters = child_nodes[j].second; id_t other_node_id = other_node.id_in_parent(dist_index); + //Rank of this node in the snarl + size_t other_rank = other_node.rank_in_parent(dist_index, + other_node_id); + size_t other_rev = other_rank % 2 == 0 + ? other_rank + 1 : other_rank - 1; -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Other net graph node is " << typeToString(other_node.node_type) << " headed by node " << other_node_id; #endif - //Rank of this node in the snarl - size_t other_rank = other_node.rank_in_parent(dist_index, - other_node_id); - size_t other_rev = other_rank % 2 == 0 - ? 
other_rank + 1 : other_rank - 1; //Find distance from each end of current node (i) to //each end of other node (j) @@ -1233,7 +1312,7 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa node_rank, other_rank); int64_t dist_r_r = snarl_index.snarlDistance( node_rank, other_rev); -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "\t distances between ranks " << node_rank << " and " << other_rank << ": " << dist_l_l << " " << dist_l_r << " " << dist_r_l << " " << dist_r_r << endl; @@ -1241,10 +1320,10 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank //group ids of clusters combined between node i left and //node j left, etc - size_t group_l_l = -1; - size_t group_l_r = -1; - size_t group_r_l = -1; - size_t group_r_r = -1; + vector group_l_l (tree_state.all_seeds->size(), -1); + vector group_l_r (tree_state.all_seeds->size(), -1); + vector group_r_l (tree_state.all_seeds->size(), -1); + vector group_r_r (tree_state.all_seeds->size(), -1); size_t fragment_group_l_l = -1; size_t fragment_group_l_r = -1; size_t fragment_group_r_l = -1; @@ -1254,125 +1333,141 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank && ((tree_state.fragment_distance_limit == 0 && MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, dist_r_l, dist_r_r})-2 <= tree_state.read_distance_limit - && min_not_minus_one(curr_child_clusters.best_left, curr_child_clusters.best_right)-2 + && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.read_distance_limit) || (tree_state.fragment_distance_limit != 0 && MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, dist_r_l, dist_r_r})-2 <= tree_state.fragment_distance_limit - && min_not_minus_one(curr_child_clusters.best_left, curr_child_clusters.best_right)-2 + && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.fragment_distance_limit) )) { //If the two nodes are reachable for (size_t c_i = 0 ; c_i < children_i.size() ; c_i ++) { //for each cluster of child node i - size_t c = children_i[c_i]; - size_t c_group = tree_state.read_union_find.find_group(c); + pair child_cluster_head = children_i[c_i]; + size_t read_num = child_cluster_head.first; + size_t c_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - pair new_dists; - pair dists_c; - - dists_c = old_dists[c]; - new_dists = tree_state.read_cluster_dists[c_group]; + pair new_dists = tree_state.read_cluster_dists[read_num][c_group]; + pair dists_c = old_dists[child_cluster_head]; if (dist_l_l != -1 && dists_c.first != -1 - && other_node_clusters.best_left != -1 ) { - //If cluster c can be combined with clusters in j + && other_node_clusters.fragment_best_left != -1 ) { + //If cluster child_cluster_head can be combined with clusters in j //from the left of both of them - combine_clusters(c_group, group_l_l, fragment_group_l_l, - dist_l_l + dists_c.first + other_node_clusters.best_left-1, new_dists); + combine_clusters(c_group, group_l_l[read_num], fragment_group_l_l, + dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1, + dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1, + new_dists, read_num); } if (dist_l_r != -1 && dists_c.first != -1 - && other_node_clusters.best_right != -1 ) { + && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j - combine_clusters(c_group, group_l_r, fragment_group_l_r, - 
dist_l_r + dists_c.first + other_node_clusters.best_right-1, new_dists); + combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, + dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1, + dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1, + new_dists, read_num); } if (dist_r_l != -1 && dists_c.second != -1 - && other_node_clusters.best_left != -1 ) { - combine_clusters(c_group, group_r_l, fragment_group_r_l, - dist_r_l + dists_c.second + other_node_clusters.best_left-1, new_dists); + && other_node_clusters.fragment_best_left != -1 ) { + combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, + dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1, + dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1, + new_dists, read_num); } if (dist_r_r != -1 && dists_c.second != -1 - && other_node_clusters.best_right != -1 ) { - combine_clusters(c_group, group_r_r, fragment_group_r_r, - dist_r_r + dists_c.second + other_node_clusters.best_right-1, new_dists); + && other_node_clusters.fragment_best_right != -1 ) { + combine_clusters(c_group, group_r_r[read_num], fragment_group_r_r, + dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1, + dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1, + new_dists, read_num); } } //Go through children of j - vector children_j( - make_move_iterator(other_node_clusters.cluster_heads.begin()), - make_move_iterator(other_node_clusters.cluster_heads.end())); + vector> children_j( + make_move_iterator(other_node_clusters.read_cluster_heads.begin()), + make_move_iterator(other_node_clusters.read_cluster_heads.end())); for (size_t k_i = 0 ; k_i < children_j.size() ; k_i++){ - size_t k = children_j[k_i]; //For each cluster of child j, find which overlaps with //clusters of i - //k will already be part of a cluster in + //child_cluster_head will already be part of a cluster in //snarlcluster heads but since we need to know the node //that the snarl is on we can't just loop through //snarl_cluster heads - pair& dist_bounds_k = old_dists[k]; - size_t k_group = tree_state.read_union_find.find_group(k); - pair dists_k = tree_state.read_cluster_dists[k_group]; + pair child_cluster_head = children_j[k_i]; + size_t read_num = child_cluster_head.first; + pair& dist_bounds_k = old_dists[child_cluster_head]; + size_t k_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); + pair dists_k = tree_state.read_cluster_dists[read_num][k_group]; - if (dist_l_l != -1 && curr_child_clusters.best_left != -1 + if (dist_l_l != -1 && curr_child_clusters.read_best_left[read_num] != -1 && dist_bounds_k.first != -1 ){ - combine_clusters(k_group, group_l_l, fragment_group_l_l, - dist_l_l + curr_child_clusters.best_left + dist_bounds_k.first-1, dists_k); + combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, + dist_l_l + curr_child_clusters.fragment_best_left + dist_bounds_k.first-1, + dist_l_l + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.first-1, + dists_k, read_num); } - if (dist_l_r != -1 && curr_child_clusters.best_left != -1 + if (dist_l_r != -1 && curr_child_clusters.read_best_left[read_num] != -1 && dist_bounds_k.second != -1 ) { - combine_clusters(k_group, group_l_r, fragment_group_l_r, - dist_l_r + curr_child_clusters.best_left + dist_bounds_k.second-1, dists_k); + combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, + dist_l_r + curr_child_clusters.fragment_best_left + 
dist_bounds_k.second-1, + dist_l_r + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.second-1, + dists_k, read_num); } - if (dist_r_l != -1 && curr_child_clusters.best_right != -1 + if (dist_r_l != -1 && curr_child_clusters.read_best_right[read_num] != -1 && dist_bounds_k.first != -1 ) { - combine_clusters(k_group, group_r_l, fragment_group_r_l, - dist_r_l + curr_child_clusters.best_right + dist_bounds_k.first-1, dists_k); + combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, + dist_r_l + curr_child_clusters.fragment_best_right + dist_bounds_k.first-1, + dist_r_l + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.first-1, + dists_k,read_num); } - if (dist_r_r != -1 && curr_child_clusters.best_right != -1 + if (dist_r_r != -1 && curr_child_clusters.read_best_right[read_num] != -1 && dist_bounds_k.second != -1 ) { - combine_clusters(k_group, group_r_r, fragment_group_r_r, - dist_r_r + curr_child_clusters.best_right + dist_bounds_k.second-1, dists_k); + combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, + dist_r_r + curr_child_clusters.fragment_best_right + dist_bounds_k.second-1, + dist_r_r + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.second-1, + dists_k, read_num); } } } } } -#ifdef DEBUG +#ifdef DEBUG_CLUSTER cerr << "Found clusters on snarl number " << snarl_index_i << " headed by" << snarl_index.id_in_parent << endl; - cerr << " with best left and right values: " << snarl_clusters.best_left << " " - << snarl_clusters.best_right << endl; + cerr << " with best left and right values: " << snarl_clusters.fragment_best_left << " " + << snarl_clusters.fragment_best_right << endl; bool got_left = false; bool got_right = false; - for (size_t c : snarl_clusters.cluster_heads) { - pair dists = tree_state.read_cluster_dists[c]; - if (dists.first == snarl_clusters.best_left) {got_left = true;} - if (dists.second == snarl_clusters.best_right) {got_right = true;} - cerr << "\t" << c << ": left: " << dists.first << " right : " + for (pair c : snarl_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + if (dists.first == snarl_clusters.fragment_best_left) {got_left = true;} + if (dists.second == snarl_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.seeds->size() ; x++) { - if (tree_state.read_union_find.find_group(x) == c) { - cerr << tree_state.seeds->at(x) << " "; + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; } } cerr << endl; } assert(got_left); assert(got_right); - for (size_t group_id : snarl_clusters.cluster_heads) { - assert (group_id == tree_state.read_union_find.find_group(group_id)); + + for (pair group_id : snarl_clusters.read_cluster_heads) { + assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); } #endif return snarl_clusters; diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index 03c4d20346c..d90410798ae 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -14,6 +14,8 @@ class SnarlSeedClusterer { SnarlSeedClusterer(MinimumDistanceIndex& dist_index); + typedef vector> cluster_group_t; + ///Given a vector of seeds (pos_t) and a distance limit, //cluster the seeds such that two 
seeds whose minimum distance
        //between them (including both of the positions) is less than
@@ -21,8 +23,7 @@ class SnarlSeedClusterer {
        //
        //Returns a vector of clusters. Each cluster is a vector of
        //indices into seeds
-        vector<vector<size_t>> cluster_seeds (
-                      vector<pos_t> seeds, int64_t read_distance_limit) const;
+        cluster_group_t cluster_seeds ( vector<pos_t> seeds, int64_t read_distance_limit) const;

        ///The same thing, but for paired end reads.
        //Given seeds from multiple reads of a fragment, cluster each set of seeds
        //The read clusters refer to seeds by their indexes in the input vectors of seeds
        //The fragment clusters give seeds the index they would get if the vectors of
        // seeds were appended to each other in the order given
-        tuple<vector<vector<size_t>>, vector<vector<size_t>>> cluster_seeds (
+        tuple<vector<cluster_group_t>, cluster_group_t> cluster_seeds (
                vector<vector<pos_t>> all_seeds,
                int64_t read_distance_limit, int64_t fragment_distance_limit=0) const;

@@ -41,6 +42,7 @@
        MinimumDistanceIndex& dist_index;

        enum ChildNodeType {CHAIN, SNARL, NODE};
+
        static inline string typeToString(ChildNodeType t) {
            switch (t) {
@@ -105,18 +107,24 @@
            // snarl/chain that is a node the parent snarl's netgraph,
            // or a snarl in a chain
+
            //set of the indices of heads of clusters (group ids in the
            //union find)
-            hash_set<size_t> cluster_heads;
+            //TODO: Add cluster distances here
+            //pair of read index, seed index
+            hash_set<pair<size_t, size_t>> read_cluster_heads;

            //The shortest distance from any seed in any cluster to the
            //left/right end of the snarl tree node that contains these
            //clusters
-            int64_t best_left;
-            int64_t best_right;
-
-            NodeClusters() :
-                best_left(-1), best_right(-1) {}
+            int64_t fragment_best_left;
+            int64_t fragment_best_right;
+            vector<int64_t> read_best_left;
+            vector<int64_t> read_best_right;
+
+            NodeClusters(size_t read_count) :
+                fragment_best_left(-1), fragment_best_right(-1),
+                read_best_left(read_count, -1), read_best_right(read_count, -1){}
        };

@@ -130,7 +138,7 @@
            vector<vector<pos_t>>* all_seeds;

            //Vector of the offset of indices for each seed
-            vector<size_t> seed_index_offsets;
+            vector<size_t> read_index_offsets;

            //The minimum distance between nodes for them to be put in the
            //same cluster
@@ -148,7 +156,8 @@
            //of the netgraph node of the cluster it belongs to
            //These values are only relevant for seeds that represent a cluster
            //in union_find_reads
-            vector<pair<int64_t, int64_t>> read_cluster_dists;
+            vector<vector<pair<int64_t, int64_t>>> read_cluster_dists;
+            vector<pair<int64_t, int64_t>> fragment_cluster_dists;

@@ -158,7 +167,7 @@
            //Maps each node to a vector of the seeds that are contained in it
            //seeds are represented by indexes into the seeds vector
            //The array is sorted.
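// The best-distance fields declared above all treat -1 as "unreachable",
// so updates go through min_not_minus_one rather than std::min, and every
// per-read update also tightens the fragment-wide bound. A minimal
// self-contained sketch of that convention: min_not_minus_one follows its
// uses in this patch series, while NodeClustersSketch and update_left are
// simplified stand-ins for illustration, not the vg types.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// -1 acts as infinity: it loses to any real distance and survives only
// when both inputs are -1.
inline int64_t min_not_minus_one(int64_t a, int64_t b) {
    return a == -1 ? b : (b == -1 ? a : std::min(a, b));
}

struct NodeClustersSketch {
    // One fragment-wide bound plus one bound per read, all unreachable
    // until a seed is seen.
    int64_t fragment_best_left = -1;
    std::vector<int64_t> read_best_left;

    explicit NodeClustersSketch(size_t read_count)
        : read_best_left(read_count, -1) {}

    // Mirrors the paired updates made throughout this patch: the read
    // bound and the fragment bound are tightened together.
    void update_left(size_t read_num, int64_t dist) {
        read_best_left[read_num] = min_not_minus_one(read_best_left[read_num], dist);
        fragment_best_left = min_not_minus_one(fragment_best_left, dist);
    }
};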
-            vector<pair<id_t, size_t>> node_to_seeds;
+            vector<vector<pair<id_t, size_t>>> node_to_seeds;

            //Map from snarl (index into dist_index.snarl_indexes) i
            //to the netgraph nodes contained in the snarl as well as the
@@ -185,18 +194,23 @@
            //Constructor takes in a pointer to the seeds and the distance limit
            TreeState (vector<vector<pos_t>>* all_seeds, int64_t read_distance_limit,
-                    int64_t fragment_distance_limit) :
+                    int64_t fragment_distance_limit, size_t seed_count) :
                all_seeds(all_seeds),
-                read_cluster_dists(seeds->size(), make_pair(-1, -1)),
-                read_union_find (seeds->size(), false),
-                fragment_union_find (seeds->size(), false),
+                fragment_cluster_dists(all_seeds->size(), make_pair(-1, -1)),
                read_distance_limit(read_distance_limit),
-                fragment_distance_limit(fragment_distance_limit){
-                seed_index_offsets.push_back(0);
-                for (auto& v : all_seeds) {
-                    size_t offset = seed_index_offsets.back() + v.size();
-                    seed_index_offsets.push_back(offset);
-                }
+                fragment_distance_limit(fragment_distance_limit),
+                fragment_union_find (seed_count, false) {
+
+                read_index_offsets.push_back(0);
+                size_t total_seeds = 0;
+                for (vector<pos_t>& v : *all_seeds) {
+                    total_seeds += v.size();
+                    size_t offset = read_index_offsets.back() + v.size();
+                    read_index_offsets.push_back(offset);
+                    read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1));
+                    node_to_seeds.emplace_back();
+                    read_union_find.emplace_back(v.size(), false);
+                }
            }
        };

diff --git a/src/subcommand/cluster_main.cpp b/src/subcommand/cluster_main.cpp
index f44945f60c5..ea8ae030af0 100644
--- a/src/subcommand/cluster_main.cpp
+++ b/src/subcommand/cluster_main.cpp
@@ -257,8 +257,7 @@ int main_cluster(int argc, char** argv) {
    // Cluster the seeds. Get sets of input seed indexes that go together.
    // Make sure to time it.
    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
-    tuple<vector<vector<size_t>>,vector<vector<size_t>>> paired_clusters = clusterer.cluster_seeds(seeds, distance_limit);
-    vector<vector<size_t>> clusters = std::move(std::get<0>(paired_clusters));
+    vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, distance_limit);
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds = end-start;

diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp
index 0e2899224c6..25d7a287e37 100644
--- a/src/unittest/seed_clusterer.cpp
+++ b/src/unittest/seed_clusterer.cpp
@@ -75,9 +75,7 @@ namespace unittest {
                seeds.push_back(make_pos_t(n, false, 0));
            }

-            tuple<vector<vector<size_t>>, vector<vector<size_t>>> paired_clusters =
-                clusterer.cluster_seeds(seeds, 10);
-            vector<vector<size_t>> clusters = std::get<0>(paired_clusters);
+            vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, 10);

            REQUIRE(clusters.size() == 1);
        }
@@ -92,9 +90,7 @@
            }

-            tuple<vector<vector<size_t>>, vector<vector<size_t>>> paired_clusters =
-                clusterer.cluster_seeds(seeds, 7);
-            vector<vector<size_t>> clusters = std::get<0>(paired_clusters);
+            vector<vector<size_t>> clusters = clusterer.cluster_seeds(seeds, 7);
            vector<hash_set<size_t>> cluster_sets;
            for (vector<size_t> v : clusters) {
                hash_set<size_t> h;
                for (size_t s : v) {
                    h.insert(s);
                }
                cluster_sets.push_back(h);
            }
@@ -123,7 +119,8 @@
            }
            SECTION( "One fragment cluster" ) {
-                vector<id_t> seed_nodes( {2, 3, 4, 7, 8, 10, 11});
+                vector<id_t> seed_nodes( {2, 3, 4});
+                vector<id_t> seed_nodes1({7, 8, 10, 11});
                //Clusters should be {2, 3, 4}, {7, 8, 10, 11}
                //One fragment cluster
                //Distance from pos on 4 to pos on 7 is 8, including one position
                vector<pos_t> seeds;
                for (id_t n : seed_nodes) {
                    seeds.push_back(make_pos_t(n, false, 0));
                }
+                vector<pos_t> seeds1;
+                for (id_t n : seed_nodes1) {
+                    seeds1.push_back(make_pos_t(n, false, 0));
+                }
+                vector<vector<pos_t>> all_seeds;
+                all_seeds.push_back(seeds);
+                all_seeds.push_back(seeds1);

-
tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 7, 15); - vector> clusters = std::get<0>(paired_clusters); + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, 7, 15); + vector>> read_clusters = std::get<0>(paired_clusters); + //Should be [[[0,1,2]],[[3,4,5,6]]] vector> fragment_clusters = std::get<1>(paired_clusters); vector> cluster_sets; - for (vector v : clusters) { + for (vector v : read_clusters[0]) { hash_set h; for (size_t s : v) { h.insert(s); } cluster_sets.push_back(h); } - REQUIRE( clusters.size() == 2); + for (vector v : read_clusters[1]) { + hash_set h; + for (size_t s : v) { + h.insert(s); + } + cluster_sets.push_back(h); + } + REQUIRE( read_clusters.size() == 2); + REQUIRE( (read_clusters[0].size() == 3 || read_clusters[1].size() == 3)); + REQUIRE( (read_clusters[0].size() == 4 || read_clusters[1].size() == 4)); REQUIRE( fragment_clusters.size() == 1); - REQUIRE (( (cluster_sets[0].count(0) == 1 && + REQUIRE (((cluster_sets[0].count(0) == 1 && cluster_sets[0].count(1) == 1 && cluster_sets[0].count(2) == 1 && cluster_sets[1].count(3) == 1 && @@ -166,19 +180,29 @@ namespace unittest { } SECTION( "Two fragment clusters" ) { - vector seed_nodes( {2, 3, 4, 7, 8, 10, 11}); + vector seed_nodes( {2, 3, 4}); + vector seed_nodes1({7, 8, 10, 11}); //Fragment clusters should be {2, 3, 4}, {7, 8, 10, 11} //Distance from pos on 4 to pos on 7 is 8, including one position vector seeds; for (id_t n : seed_nodes) { seeds.push_back(make_pos_t(n, false, 0)); } + vector seeds1; + for (id_t n : seed_nodes1) { + seeds.push_back(make_pos_t(n, false, 0)); + } + vector> all_seeds; + all_seeds.push_back(seeds); + all_seeds.push_back(seeds1); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 2, 7); - vector> clusters = std::get<0>(paired_clusters); + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, 2, 7); + vector>> read_clusters = std::get<0>(paired_clusters); + // read_clusters = [ [[0,1,2]],[[3,4,5,6]] ] vector> fragment_clusters = std::get<1>(paired_clusters); + // fragment_clusters = [ [0,1,2], [3,4,5,6] ] vector> fragment_cluster_sets; for (vector v : fragment_clusters) { hash_set h; @@ -187,7 +211,7 @@ namespace unittest { } fragment_cluster_sets.push_back(h); } - REQUIRE( clusters.size() == 3); + REQUIRE( read_clusters.size() == 2); REQUIRE( fragment_clusters.size() == 2); REQUIRE (( (fragment_cluster_sets[0].count(0) == 1 && fragment_cluster_sets[0].count(1) == 1 && @@ -251,9 +275,7 @@ namespace unittest { seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 13); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 13); REQUIRE( clusters.size() == 1); } @@ -262,9 +284,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(11, false, 9)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 8); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 8); REQUIRE( clusters.size() == 1); @@ -313,9 +333,7 @@ namespace unittest { seeds.push_back(make_pos_t(7, false, 0)); seeds.push_back(make_pos_t(6, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 20); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 20); REQUIRE( clusters.size() == 1); @@ -325,9 +343,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); 
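// For the fragment-cluster sections above: read clusters index seeds
// within each read's own vector, while fragment clusters number seeds
// globally, in the order the per-read vectors are concatenated. The
// clusterer bridges the two with read_index_offsets, a prefix sum of
// per-read seed counts. A small standalone sketch of that mapping;
// build_offsets is a hypothetical helper, only the prefix-sum scheme
// is taken from this patch series.

#include <cstddef>
#include <vector>

std::vector<size_t> build_offsets(const std::vector<size_t>& seeds_per_read) {
    // offsets[r] is the global index of the first seed of read r, so
    // (read, local index) maps to offsets[read] + local index.
    std::vector<size_t> offsets{0};
    for (size_t n : seeds_per_read) {
        offsets.push_back(offsets.back() + n);
    }
    return offsets;
}

int main() {
    // Two reads with 3 and 4 seeds, as in "One fragment cluster":
    std::vector<size_t> offsets = build_offsets({3, 4});
    // Seed 1 of read 1 is seed offsets[1] + 1 == 4 of the fragment.
    return offsets[1] + 1 == 4 ? 0 : 1;
}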
seeds.push_back(make_pos_t(6, true, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 20); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 20); } @@ -336,9 +352,7 @@ namespace unittest { seeds.push_back(make_pos_t(8, false, 0)); seeds.push_back(make_pos_t(6, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 20); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 20); REQUIRE( clusters.size() == 1); @@ -411,9 +425,7 @@ namespace unittest { seeds.push_back(make_pos_t(6, false, 0)); seeds.push_back(make_pos_t(8, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 2); vector> cluster_sets; @@ -455,30 +467,58 @@ namespace unittest { seeds.push_back(make_pos_t(14, false, 0)); seeds.push_back(make_pos_t(15, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); - vector> fragment_clusters = std::get<1>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 4); - REQUIRE( fragment_clusters.size() == seeds.size()); + + vector> all_seeds; + all_seeds.push_back(seeds); + tuple>>, vector>> paired_clusters = clusterer.cluster_seeds(all_seeds, 3, 3); + vector>> read_clusters = std::get<0>(paired_clusters); + vector>fragment_clusters = std::get<1>(paired_clusters); + + REQUIRE( read_clusters.size() == 1); + REQUIRE( read_clusters[0].size() == 1); + REQUIRE( fragment_clusters.size() == 4); //New fragment clusters + } SECTION ("Four fragment clusters") { + vector> all_seeds; + vector seeds; + seeds.push_back(make_pos_t(3, false, 0)); + seeds.push_back(make_pos_t(5, false, 0)); + seeds.push_back(make_pos_t(16, false, 0)); + //New cluster + seeds.push_back(make_pos_t(6, false, 0)); + seeds.push_back(make_pos_t(8, false, 0)); + all_seeds.push_back(seeds); + seeds.clear(); + //New cluster + seeds.push_back(make_pos_t(5, false, 8)); + //New cluster + seeds.push_back(make_pos_t(13, false, 1)); + seeds.push_back(make_pos_t(14, false, 0)); + seeds.push_back(make_pos_t(15, false, 0)); + all_seeds.push_back(seeds); - paired_clusters = clusterer.cluster_seeds(seeds, 3, 3); - clusters = std::get<0>(paired_clusters); - fragment_clusters = std::get<1>(paired_clusters); + tuple>>, vector>> paired_clusters = clusterer.cluster_seeds(all_seeds, 3, 3); + vector>> read_clusters = std::get<0>(paired_clusters); + vector> fragment_clusters = std::get<1>(paired_clusters); - REQUIRE( clusters.size() == 4); + REQUIRE( read_clusters.size() == 2); + REQUIRE( read_clusters[0].size() == 2); + REQUIRE( read_clusters[1].size() == 2); REQUIRE( fragment_clusters.size() == 4); //New fragment clusters - paired_clusters = clusterer.cluster_seeds(seeds, 3, 5); - clusters = std::get<0>(paired_clusters); + paired_clusters = clusterer.cluster_seeds(all_seeds, 3, 5); + read_clusters = std::get<0>(paired_clusters); fragment_clusters = std::get<1>(paired_clusters); - REQUIRE( clusters.size() == 4); + REQUIRE( read_clusters.size() == 2); + REQUIRE( read_clusters[0].size() == 2); + REQUIRE( read_clusters[1].size() == 2); REQUIRE( fragment_clusters.size() == 2); } SECTION( "Same node, same cluster" ) { @@ -487,9 +527,7 @@ namespace unittest { seeds.push_back(make_pos_t(5, false, 11)); 
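// The fragment-limit sections above turn on the interplay of two limits:
// the read distance limit defines read clusters, and the looser (or equal)
// fragment distance limit defines fragment clusters, so every fragment
// cluster is a union of whole read clusters. A one-dimensional toy of that
// relationship; the real clusterer uses minimum graph distances and two
// union-finds, this stand-in just scans sorted offsets.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Positions within `limit` of a neighbor (transitively) share a cluster,
// so the cluster count is one more than the number of oversized gaps.
size_t count_clusters(const std::vector<int64_t>& sorted_pos, int64_t limit) {
    if (sorted_pos.empty()) return 0;
    size_t clusters = 1;
    for (size_t i = 1; i < sorted_pos.size(); i++) {
        if (sorted_pos[i] - sorted_pos[i - 1] > limit) clusters++;
    }
    return clusters;
}

int main() {
    std::vector<int64_t> seeds = {0, 2, 4, 11, 13, 30};
    std::printf("read clusters: %zu\n", count_clusters(seeds, 3));     // 3
    std::printf("fragment clusters: %zu\n", count_clusters(seeds, 8)); // 2
    return 0;
}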
seeds.push_back(make_pos_t(5, false, 5)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 7); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 7); REQUIRE( clusters.size() == 1); @@ -535,9 +573,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(7, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters= clusterer.cluster_seeds(seeds, 10); REQUIRE( clusters.size() == 1); @@ -549,9 +585,7 @@ namespace unittest { seeds.push_back(make_pos_t(7, false, 0)); seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); REQUIRE( clusters.size() == 1); @@ -561,9 +595,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); @@ -575,9 +607,7 @@ namespace unittest { seeds.push_back(make_pos_t(4, false, 1)); seeds.push_back(make_pos_t(6, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 5); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 5); REQUIRE( clusters.size() == 2); @@ -585,9 +615,7 @@ namespace unittest { SECTION("No clusters") { vector seeds; - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 5); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 5); REQUIRE( clusters.size() == 0); @@ -641,9 +669,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(9, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 5); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 5); REQUIRE( clusters.size() == 2); @@ -693,9 +719,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(8, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 2); @@ -707,9 +731,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(7, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 6); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 6); REQUIRE( clusters.size() == 1); @@ -721,9 +743,7 @@ namespace unittest { seeds.push_back(make_pos_t(8, false, 0)); seeds.push_back(make_pos_t(10, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); REQUIRE( clusters.size() == 1); @@ -770,9 +790,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(4, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); REQUIRE( 
clusters.size() == 1); @@ -782,9 +800,7 @@ namespace unittest { seeds.push_back(make_pos_t(5, false, 0)); seeds.push_back(make_pos_t(3, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 10); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 10); REQUIRE( clusters.size() == 1); @@ -795,9 +811,7 @@ namespace unittest { seeds.push_back(make_pos_t(3, false, 0)); seeds.push_back(make_pos_t(8, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 3); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 3); @@ -808,9 +822,7 @@ namespace unittest { seeds.push_back(make_pos_t(2, false, 0)); seeds.push_back(make_pos_t(3, false, 0)); - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, 15); - vector> clusters = std::get<0>(paired_clusters); + vector> clusters = clusterer.cluster_seeds(seeds, 15); REQUIRE( clusters.size() == 1); @@ -840,116 +852,127 @@ namespace unittest { uniform_int_distribution randSnarlIndex(0, allSnarls.size()-1); default_random_engine generator(time(NULL)); for (size_t k = 0; k < 100 ; k++) { - vector seeds; + vector> all_seeds; + all_seeds.emplace_back(); + all_seeds.emplace_back(); int64_t read_lim = 20;// Distance between read clusters int64_t fragment_lim = 30;// Distance between fragment clusters - for (int j = 0; j < 20; j++) { - //Check clusters of j random positions - const Snarl* snarl1 = allSnarls[randSnarlIndex(generator)]; + for (size_t read = 0 ; read < 2 ; read ++) { + for (int j = 0; j < 20; j++) { + //Check clusters of j random positions + const Snarl* snarl1 = allSnarls[randSnarlIndex(generator)]; - pair, unordered_set> contents1 = - snarl_manager.shallow_contents(snarl1, graph, true); + pair, unordered_set> contents1 = + snarl_manager.shallow_contents(snarl1, graph, true); - vector nodes1 (contents1.first.begin(), contents1.first.end()); + vector nodes1 (contents1.first.begin(), contents1.first.end()); - uniform_int_distribution randNodeIndex1(0,nodes1.size()-1); + uniform_int_distribution randNodeIndex1(0,nodes1.size()-1); - id_t nodeID1 = nodes1[randNodeIndex1(generator)]; - handle_t node1 = graph.get_handle(nodeID1); + id_t nodeID1 = nodes1[randNodeIndex1(generator)]; + handle_t node1 = graph.get_handle(nodeID1); - off_t offset1 = uniform_int_distribution(0,graph.get_length(node1) - 1)(generator); + off_t offset1 = uniform_int_distribution(0,graph.get_length(node1) - 1)(generator); - pos_t pos = make_pos_t(nodeID1, - uniform_int_distribution(0,1)(generator) == 0,offset1 ); - seeds.push_back(pos); + pos_t pos = make_pos_t(nodeID1, + uniform_int_distribution(0,1)(generator) == 0,offset1 ); + all_seeds[read].push_back(pos); + } } - tuple>, vector>> paired_clusters = - clusterer.cluster_seeds(seeds, read_lim, fragment_lim); - vector> read_clusters = std::get<0>(paired_clusters); + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, read_lim, fragment_lim); + vector>> read_clusters = std::get<0>(paired_clusters); vector> fragment_clusters = std::get<1>(paired_clusters); + vector ordered_seeds (all_seeds[0]); + for (pos_t s : all_seeds[1]){ + ordered_seeds.push_back(s); + } - for (size_t a = 0; a < read_clusters.size(); a++) { - // For each cluster -cluster this cluster to ensure that - // there is only one - vector clust = read_clusters[a]; - - structures::UnionFind new_clusters (clust.size(), false); - - for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { - pos_t pos1 
= seeds[clust[i1]]; - size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); - pos_t rev1 = make_pos_t(get_id(pos1), - !is_rev(pos1), - len1 - get_offset(pos1)-1); - - for (size_t b = 0 ; b < read_clusters.size() ; b++) { - if (b != a) { - //For each other cluster - vector clust2 = read_clusters[b]; - for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { - //And each position in each other cluster, - //make sure that this position is far away from i1 - pos_t pos2 = seeds[clust2[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should have been in the same read cluster: " ; - cerr << pos1 << " and " << pos2 << endl; - cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; - REQUIRE(false); + for (size_t read_num = 0 ; read_num <= 2 ; read_num ++) { + auto& one_read_clusters = read_clusters[read_num]; + for (size_t a = 0; a < one_read_clusters.size(); a++) { + // For each cluster -cluster this cluster to ensure that + // there is only one + vector clust = one_read_clusters[a]; + + structures::UnionFind new_clusters (clust.size(), false); + + for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { + pos_t pos1 = all_seeds[read_num][clust[i1]]; + size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); + pos_t rev1 = make_pos_t(get_id(pos1), + !is_rev(pos1), + len1 - get_offset(pos1)-1); + + for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { + if (b != a) { + //For each other cluster + vector clust2 = one_read_clusters[b]; + for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { + //And each position in each other cluster, + //make sure that this position is far away from i1 + pos_t pos2 = all_seeds[read_num][clust2[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + + int64_t dist1 = dist_index.minDistance(pos1, pos2); + int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should have been in the same read cluster: " ; + cerr << pos1 << " and " << pos2 << endl; + cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; + REQUIRE(false); + } + } - } } - } - for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { - //For each position in the same cluster - pos_t pos2 = seeds[clust[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = 
MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - new_clusters.union_groups(i1, i2); - } + for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { + //For each position in the same cluster + pos_t pos2 = all_seeds[read_num][clust[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + int64_t dist1 = dist_index.minDistance(pos1, pos2); + int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + new_clusters.union_groups(i1, i2); + } + } } - } - auto actual_clusters = new_clusters.all_groups(); - if (actual_clusters.size() != 1) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should be different read clusters: " << endl; - for (auto c : actual_clusters) { - cerr << "cluster: " ; - for (size_t i1 : c) { - cerr << seeds[clust[i1]] << " "; + auto actual_clusters = new_clusters.all_groups(); + if (actual_clusters.size() != 1) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should be different read clusters: " << endl; + for (auto c : actual_clusters) { + cerr << "cluster: " ; + for (size_t i1 : c) { + cerr << all_seeds[read_num][clust[i1]] << " "; + } + cerr << endl; } - cerr << endl; } + REQUIRE(actual_clusters.size() == 1); } - REQUIRE(actual_clusters.size() == 1); } for (size_t a = 0; a < fragment_clusters.size(); a++) { // For each cluster -cluster this cluster to ensure that @@ -959,7 +982,7 @@ namespace unittest { structures::UnionFind new_clusters (clust.size(), false); for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { - pos_t pos1 = seeds[clust[i1]]; + pos_t pos1 = ordered_seeds[clust[i1]]; size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); pos_t rev1 = make_pos_t(get_id(pos1), !is_rev(pos1), @@ -972,7 +995,7 @@ namespace unittest { for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { //And each position in each other cluster, //make sure that this position is far away from i1 - pos_t pos2 = seeds[clust2[i2]]; + pos_t pos2 = ordered_seeds[clust2[i2]]; size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); pos_t rev2 = make_pos_t(get_id(pos2), !is_rev(pos2), @@ -998,7 +1021,7 @@ namespace unittest { } for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { //For each position in the same cluster - pos_t pos2 = seeds[clust[i2]]; + pos_t pos2 = ordered_seeds[clust[i2]]; size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); pos_t rev2 = make_pos_t(get_id(pos2), !is_rev(pos2), @@ -1023,7 +1046,7 @@ namespace unittest { for (auto c : actual_clusters) { cerr << "cluster: " ; for (size_t i1 : c) { - cerr << seeds[clust[i1]] << " "; + cerr << ordered_seeds[clust[i1]] << " "; } cerr << endl; } From e0108cb8c96d3832579c5d2cf98411a069a287c1 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Thu, 7 Nov 2019 08:43:30 -0800 Subject: [PATCH 28/79] Made debug code compile --- src/seed_clusterer.cpp | 96 ++++++++++++++++++++------------- src/unittest/seed_clusterer.cpp | 40 ++++++-------- 2 files changed, 77 insertions(+), 59 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index ede84c05f36..01884bb2518 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -98,6 +98,7 @@ cerr 
<< endl << "New cluster calculation:" << endl; } cerr << endl; } + cerr << endl; } vector ordered_seeds; for (size_t i = 0 ; i < tree_state.all_seeds->size() ; i++) { @@ -305,11 +306,11 @@ cerr << endl << "New cluster calculation:" << endl; //seeds on this node must be in the same cluster for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { - if (tree_state.node_to_seeds[read_num].size() > 0) { - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), - std::pair(node_id, 0)); + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { size_t group_id = seed_range_start->second; size_t fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; @@ -382,23 +383,25 @@ cerr << endl << "New cluster calculation:" << endl; vector> seed_offsets; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { // for all seeds - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), - std::pair(node_id, 0)); - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { - //For each seed, find its offset - pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; - int64_t offset = is_rev(seed) ? node_length - get_offset(seed) - : get_offset(seed) + 1; - - node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); - node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); - node_clusters.read_best_left[read_num] = min_not_minus_one(offset, node_clusters.read_best_left[read_num]); - node_clusters.read_best_right[read_num] = min_not_minus_one(node_length-offset+1, node_clusters.read_best_right[read_num]); - - seed_offsets.emplace_back(read_num, iter->second, offset); + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(), + tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + //For each seed, find its offset + pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) + : get_offset(seed) + 1; + + node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); + node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); + node_clusters.read_best_left[read_num] = min_not_minus_one(offset, node_clusters.read_best_left[read_num]); + node_clusters.read_best_right[read_num] = min_not_minus_one(node_length-offset+1, node_clusters.read_best_right[read_num]); + + seed_offsets.emplace_back(read_num, iter->second, offset); + } } } //Sort seeds by their position in the node @@ -469,7 +472,9 @@ cerr << endl << "New cluster calculation:" << endl; } } for (size_t i = 0 ; i < read_last_cluster.size() ; i++) { - node_clusters.read_cluster_heads.emplace(i, read_last_cluster[i]); + if (read_last_cluster[i] != -1) { + node_clusters.read_cluster_heads.emplace(i, read_last_cluster[i]); + } } #ifdef DEBUG_CLUSTER @@ -478,18 +483,16 @@ cerr << endl << "New cluster calculation:" << endl; bool got_left = false; bool got_right = false; - for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { - for (pair c : node_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); - assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); - assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); - assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); - if (dists.first == node_clusters.fragment_best_left) {got_left = true;} - if (dists.second == node_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":"< c : node_clusters.read_cluster_heads) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[c.first]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[c.first]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert(has_seeds); cerr << endl; } cerr << endl; @@ -823,7 +830,15 @@ cerr << " Combining this cluster from the right" << endl; } else { //Cluster tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); - combined_cluster[read_num] = tree_state.read_union_find[read_num].find_group(cluster_head.second); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); + + if (new_group == cluster_head.second) { + to_erase.emplace_back(read_num,combined_cluster[read_num]); + } else { + to_erase.push_back(cluster_head); + } + + combined_cluster[read_num] = new_group; combined_left[read_num] = min_not_minus_one(combined_left[read_num], snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left); combined_right[read_num] = min_not_minus_one(combined_right[read_num],snarl_dists.second); @@ -969,11 +984,14 @@ cerr << " Combining this cluster from the right" << endl; pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t\t"; + bool has_seeds = false; for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert (has_seeds); cerr << endl; } #endif @@ -1069,12 +1087,15 @@ cerr << " Combining this cluster from the right" << endl; cerr << "best left : " << chain_clusters.fragment_best_left << " best right : " << chain_clusters.fragment_best_right << endl; for (pair c : chain_clusters.read_cluster_heads) { - cerr << "\t"; + cerr << "\tcluster " << c.first << ":" << c.second; + bool has_seeds = false; for (size_t x = 0 ; x < tree_state.all_seeds->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert(has_seeds); cerr << endl; } bool got_left = false; @@ -1456,11 +1477,14 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; + bool has_seeds = false; for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; } } + assert(has_seeds); cerr << endl; } assert(got_left); diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 25d7a287e37..7afbd2a851b 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -130,7 +130,7 @@ namespace unittest { } vector seeds1; for (id_t n : seed_nodes1) { - seeds.push_back(make_pos_t(n, false, 0)); + seeds1.push_back(make_pos_t(n, false, 0)); } vector> all_seeds; all_seeds.push_back(seeds); @@ -142,41 +142,35 @@ namespace unittest { vector>> read_clusters = std::get<0>(paired_clusters); //Should be [[[0,1,2]],[[3,4,5,6]]] vector> fragment_clusters = std::get<1>(paired_clusters); - vector> cluster_sets; + vector> read_set_1; for (vector v : read_clusters[0]) { hash_set h; for (size_t s : v) { h.insert(s); } - cluster_sets.push_back(h); + read_set_1.push_back(h); } + vector> read_set_2; for (vector v : read_clusters[1]) { hash_set h; for (size_t s : v) { h.insert(s); } - cluster_sets.push_back(h); + read_set_2.push_back(h); } REQUIRE( read_clusters.size() == 2); - REQUIRE( (read_clusters[0].size() == 3 || read_clusters[1].size() == 3)); - REQUIRE( (read_clusters[0].size() == 4 || read_clusters[1].size() == 4)); + REQUIRE( (read_clusters[0][0].size() == 3 || read_clusters[1][0].size() == 3)); + REQUIRE( (read_clusters[0][0].size() == 4 || read_clusters[1][0].size() == 4)); REQUIRE( fragment_clusters.size() == 1); - REQUIRE (((cluster_sets[0].count(0) == 1 && - cluster_sets[0].count(1) == 1 && - cluster_sets[0].count(2) == 1 && - cluster_sets[1].count(3) == 1 && - cluster_sets[1].count(4) == 1 && - cluster_sets[1].count(5) == 1 && - cluster_sets[1].count(6) == 1 ) || - - ( cluster_sets[1].count(0) == 1 && - cluster_sets[1].count(1) == 1 && - cluster_sets[1].count(2) == 1 && - 
cluster_sets[0].count(3) == 1 && - cluster_sets[0].count(4) == 1 && - cluster_sets[0].count(5) == 1 && - cluster_sets[0].count(6) == 1 ))); - + REQUIRE ( read_set_1.size() == 1); + REQUIRE (( read_set_1[0].count(0) == 1 && + read_set_1[0].count(1) == 1 && + read_set_1[0].count(2) == 1)); + REQUIRE (read_set_2.size() == 1); + REQUIRE (( read_set_2[0].count(0) == 1 && + read_set_2[0].count(1) == 1 && + read_set_2[0].count(2) == 1 && + read_set_2[0].count(3) == 1 )); } SECTION( "Two fragment clusters" ) { @@ -190,7 +184,7 @@ namespace unittest { } vector seeds1; for (id_t n : seed_nodes1) { - seeds.push_back(make_pos_t(n, false, 0)); + seeds1.push_back(make_pos_t(n, false, 0)); } vector> all_seeds; all_seeds.push_back(seeds); From e23f732d3495156c9f9e54d71d3ab94d40f773d9 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Thu, 7 Nov 2019 16:02:30 -0800 Subject: [PATCH 29/79] Clusterer passes unit tests --- src/seed_clusterer.cpp | 354 +++++++++++++++++++------------- src/seed_clusterer.hpp | 6 +- src/unittest/seed_clusterer.cpp | 149 +++++++------- 3 files changed, 292 insertions(+), 217 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index 01884bb2518..31fe6e6b7b7 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -26,7 +26,7 @@ namespace vg { * Returns a vector of cluster assignments */ #ifdef DEBUG_CLUSTER -cerr << endl << "New cluster calculation:" << endl; +cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; #endif if (fragment_distance_limit != 0 && fragment_distance_limit < read_distance_limit) { @@ -49,6 +49,7 @@ cerr << endl << "New cluster calculation:" << endl; for (auto& v : all_seeds) seed_count+= v.size(); TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); + //Populate tree_state.node_to_seeds (mapping each node to the seeds it //contains) and snarl_to_nodes_by_level get_nodes(tree_state, snarl_to_nodes_by_level); @@ -74,6 +75,12 @@ cerr << endl << "New cluster calculation:" << endl; move(snarl_to_nodes_by_level[depth - 1]); } +#ifdef DEBUG_CLUSTER +assert(tree_state.read_index_offsets[0] == 0); +for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { + assert (tree_state.read_index_offsets[i] + tree_state.all_seeds->at(i).size() == tree_state.read_index_offsets[i+1]); +} +#endif //Cluster all the snarls at this depth //Also records which snarls are in chains and the parents of these //snarls in tree_state.parent_snarl_to_node @@ -131,7 +138,7 @@ cerr << endl << "New cluster calculation:" << endl; void SnarlSeedClusterer::get_nodes( TreeState& tree_state, vector>>>& - snarl_to_nodes) const { + snarl_to_nodes_by_level) const { // Assign each seed to a node. for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ @@ -145,17 +152,16 @@ cerr << endl << "New cluster calculation:" << endl; } // Assign each node to a snarl. 
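// For reference, a minimal stand-alone sketch (hypothetical names, not part
// of this patch) of the read_index_offsets invariant asserted above: seeds
// are stored per read, but the fragment union-find runs over one flat index
// range, so offsets[i] is the prefix sum of the earlier reads' seed counts
// and read i's seed j gets global index offsets[i] + j.
//
//     vector<size_t> offsets(1, 0);                      // offsets[0] == 0
//     for (auto& read_seeds : all_seeds) {
//         offsets.push_back(offsets.back() + read_seeds.size());
//     }
//     // checked above: offsets[i] + all_seeds[i].size() == offsets[i + 1]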
- id_t prev_node = -1; + hash_set seen_nodes; for (auto& read_node :tree_state.node_to_seeds) { for (auto& mapping : read_node) { - if (mapping.first == prev_node) { - continue; + if (seen_nodes.count(mapping.first) < 1) { + seen_nodes.insert( mapping.first); + size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); + size_t depth = dist_index.snarl_indexes[snarl_i].depth; + snarl_to_nodes_by_level[depth][snarl_i].emplace_back( + NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); } - prev_node = mapping.first; - size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); - size_t depth = dist_index.snarl_indexes[snarl_i].depth; - snarl_to_nodes[depth][snarl_i].emplace_back( - NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); } } } @@ -305,15 +311,16 @@ cerr << endl << "New cluster calculation:" << endl; //If the limit is greater than the node length, then all the //seeds on this node must be in the same cluster + size_t fragment_group_id = -1; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { auto seed_range_start = std::lower_bound( tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end(), std::pair(node_id, 0)); - if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { + if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { size_t group_id = seed_range_start->second; - size_t fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; + if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { //For each seed on this node, add it to the cluster @@ -350,33 +357,48 @@ cerr << endl << "New cluster calculation:" << endl; if (tree_state.fragment_distance_limit != 0) { fragment_group_id = tree_state.fragment_union_find.find_group(fragment_group_id); - tree_state.fragment_cluster_dists[fragment_group_id] = make_pair(node_clusters.fragment_best_left, - node_clusters.fragment_best_right); } + } + } #ifdef DEBUG_CLUSTER - assert (group_id == tree_state.read_union_find[read_num].find_group(group_id)); - cerr << "Found single cluster on node " << node_id << "with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; - bool got_left = false; - bool got_right = false; - for (pair c : node_clusters.read_cluster_heads) { + cerr << "Found single cluster on node " << node_id << " with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; + + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << node_clusters.read_best_left[read_num] << " best right: " << node_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + for (pair c : node_clusters.read_cluster_heads) { + if (c.first == read_num) { pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } assert(dists.first == -1 || dists.first >= 
node_clusters.read_best_left[read_num]); assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); if (dists.first == node_clusters.fragment_best_left) {got_left = true;} if (dists.second == node_clusters.fragment_best_right) {got_right = true;} - //if (dists.first == node_clusters.read_best_left[read_num]) {got_all_left[read_num] = true;} - //if (dists.second == node_clusters.read_best_right[read_num]) {got_all_right[read_num] = true;} - cerr << "\t" << c.first << ":"<(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); fragment_last_cluster = tree_state.fragment_union_find.find_group(std::get<1>(s)+tree_state.read_index_offsets[read_num]); - tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); fragment_last_offset = std::get<2>(s); } } else { @@ -454,19 +474,16 @@ cerr << endl << "New cluster calculation:" << endl; make_pair(read_last_offset[read_num], node_length - read_last_offset[read_num] + 1); if (tree_state.fragment_distance_limit != 0) { if (fragment_last_offset != -1 && - abs(read_last_offset[read_num] - fragment_last_offset) <= tree_state.fragment_distance_limit) { + abs(std::get<2>(s) - fragment_last_offset) <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster - int64_t prev_dist_left = tree_state.fragment_cluster_dists[fragment_last_cluster].first; tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); fragment_last_cluster = tree_state.fragment_union_find.find_group(fragment_last_cluster); - tree_state.fragment_cluster_dists[fragment_last_cluster] = make_pair(prev_dist_left, node_length-std::get<2>(s)+1); + fragment_last_offset = std::get<2>(s); } else { //If this is a new fragment cluster as well fragment_last_cluster = std::get<1>(s)+tree_state.read_index_offsets[read_num]; fragment_last_offset = std::get<2>(s); - tree_state.fragment_cluster_dists[fragment_last_cluster] = - make_pair(fragment_last_offset, node_length-fragment_last_offset+1); } } } @@ -480,21 +497,40 @@ cerr << endl << "New cluster calculation:" << endl; #ifdef DEBUG_CLUSTER cerr << "Found read clusters on node " << node_id << endl; + bool got_left = false; bool got_right = false; - - for (pair c : node_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[c.first]); - assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[c.first]); - assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); - assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); - if (dists.first == node_clusters.fragment_best_left) {got_left = true;} - if (dists.second == node_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":"<size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << node_clusters.read_best_left[read_num] << " best right: " << node_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + for (pair c : node_clusters.read_cluster_heads) { + if (c.first == read_num) { + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << 
":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } + assert(dists.first == -1 || dists.first >= node_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= node_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= node_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= node_clusters.fragment_best_right); + if (dists.first == node_clusters.fragment_best_left) {got_left = true;} + if (dists.second == node_clusters.fragment_best_right) {got_right = true;} + if (dists.first == node_clusters.read_best_left[read_num]) {got_read_left = true;} + if (dists.second == node_clusters.read_best_right[read_num]) {got_read_right = true;} + cerr << endl; + assert(has_seeds); + } + } + assert(got_read_left || node_clusters.read_best_left[read_num] == -1); + assert(got_read_right || node_clusters.read_best_right[read_num] == -1); } - assert(got_left ); + assert(got_left); assert(got_right); for (pair group_id : node_clusters.read_cluster_heads) { assert (group_id.second == tree_state.read_union_find[group_id.first].find_group(group_id.second)); @@ -531,7 +567,7 @@ cerr << endl << "New cluster calculation:" << endl; //Used when two clusters in the same snarl can be combined by //looping in the chain - if (read_dist <= tree_state.read_distance_limit) { + if (read_dist != -1 && read_dist <= tree_state.read_distance_limit) { if (combined_group == -1) { combined_group = new_group; } else { @@ -576,12 +612,12 @@ cerr << endl << "New cluster calculation:" << endl; fragment_dist <= tree_state.fragment_distance_limit) { //If these aren't in the same read cluster but are in //the same fragment cluster - if (fragment_combined_group == -1) { - fragment_combined_group = new_group; - } else { - tree_state.fragment_union_find.union_groups(new_group, fragment_combined_group); - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + if (fragment_combined_group != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_group, + new_group + tree_state.read_index_offsets[read_num]); } + fragment_combined_group = tree_state.fragment_union_find.find_group( + new_group + tree_state.read_index_offsets[read_num]); } cerr << endl; return; @@ -765,17 +801,18 @@ cerr << " (Possibly) updating looping distance to right of snarl cluster " << r #endif - if (snarl_clusters.read_best_left[read_num] != -1 && snarl_dists.first != -1 ) { + if (snarl_clusters.fragment_best_left!= -1 && snarl_dists.first != -1 ) { //If this cluster can be combined with another cluster //from the left #ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the left " ; #endif + int64_t read_dist = snarl_clusters.read_best_left[read_num] == -1 ? 
-1 : + snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1; combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], fragment_snarl_cluster_left, to_erase, snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1, - snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1, - snarl_dists, read_num); + read_dist, snarl_dists, read_num); } } @@ -798,18 +835,19 @@ cerr << "Updating looping distance to left of snarl cluster " << read_num << ":" #endif } - if (snarl_clusters.read_best_right[read_num] != -1 && snarl_dists.second != -1 ) { + if (snarl_clusters.fragment_best_right != -1 && snarl_dists.second != -1 ) { //If this cluster can be combined with another cluster //from the right #ifdef DEBUG_CLUSTER cerr << " Combining this cluster from the right" << endl; #endif + int64_t read_dist = snarl_clusters.read_best_right[read_num] == -1 ? -1 : + snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1; combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], fragment_snarl_cluster_right, to_erase, snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1, - snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1, - snarl_dists, read_num); + read_dist, snarl_dists, read_num); } } @@ -855,7 +893,7 @@ cerr << " Combining this cluster from the right" << endl; //If the snarl cluster does not get combined with any of //the existing chain clusters, then it becomes a new chain cluster if (tree_state.fragment_distance_limit != 0 && fragment_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.read_distance_limit) { + snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.fragment_distance_limit) { //Cluster in the same fragment but not the same read if (fragment_combined_cluster != -1) { //Also cluster by fragment @@ -1045,7 +1083,10 @@ cerr << " Combining this cluster from the right" << endl; } else { tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); if (tree_state.fragment_distance_limit != 0) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second + tree_state.all_seeds->size()); + if (fragment_combined_cluster != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); + } + fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); } size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); if (new_group == cluster_head.second) { @@ -1087,35 +1128,47 @@ cerr << " Combining this cluster from the right" << endl; cerr << "best left : " << chain_clusters.fragment_best_left << " best right : " << chain_clusters.fragment_best_right << endl; for (pair c : chain_clusters.read_cluster_heads) { - cerr << "\tcluster " << c.first << ":" << c.second; - bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->size() ; x++) { - if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; - has_seeds = true; - } - } - assert(has_seeds); - cerr << endl; } bool got_left = false; bool got_right = false; - for (pair c : 
chain_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - if (!chain_index.is_looping_chain){ - assert(dists.first == -1 || dists.first >= chain_clusters.fragment_best_left); - assert(dists.second == -1 || dists.second >= chain_clusters.fragment_best_right); - assert(dists.first == -1 || dists.first >= chain_clusters.read_best_left[c.first]); - assert(dists.second == -1 || dists.second >= chain_clusters.read_best_right[c.first]); + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << chain_clusters.read_best_left[read_num] << " best right: " << chain_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : chain_clusters.read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } + assert(dists.first == -1 || dists.first >= chain_clusters.read_best_left[read_num]); + assert(dists.second == -1 || dists.second >= chain_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= chain_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= chain_clusters.fragment_best_right); + if (dists.first == chain_clusters.fragment_best_left) {got_left = true;} + if (dists.second == chain_clusters.fragment_best_right) {got_right = true;} + if (dists.first == chain_clusters.read_best_left[read_num]) {got_read_left = true;} + if (dists.second == chain_clusters.read_best_right[read_num]) {got_read_right = true;} + cerr << endl; + assert(has_seeds); + } + } + if (!chain_index.is_looping_chain) { + assert(!any_clusters || got_read_left || chain_clusters.read_best_left[read_num] > tree_state.read_distance_limit || chain_clusters.read_best_left[read_num] == -1); + assert(!any_clusters || got_read_right || chain_clusters.read_best_right[read_num] > tree_state.read_distance_limit || chain_clusters.read_best_right[read_num] == -1); } - if (dists.first == chain_clusters.fragment_best_left) {got_left = true;} - if (dists.second == chain_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " - << dists.second << endl; } + if (!chain_index.is_looping_chain) { - assert(got_left); - assert(got_right); + assert(got_left || chain_clusters.fragment_best_left > tree_state.fragment_distance_limit); + assert(got_right ||chain_clusters.fragment_best_right > tree_state.fragment_distance_limit ); } for (pair group_id : chain_clusters.read_cluster_heads) { @@ -1143,14 +1196,13 @@ cerr << " Combining this cluster from the right" << endl; NodeClusters snarl_clusters(tree_state.all_seeds->size()); auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, - size_t& fragment_combined_group, int64_t read_dist, - int64_t fragment_dist, - pair& end_dists, size_t read_num){ + size_t& fragment_combined_group, + int64_t fragment_dist, int64_t read_dist, size_t read_num){ //Helper function to compare and combine clusters in two nodes of the same snarl //If the distance between two clusters is small enough, then combine them //for the read clusters and, if applicable, for the fragment clusters 
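//A simplified sketch of the rule these combine helpers apply (illustration
//only, not part of this patch; assumes a generic union-find with
//union_groups, where off is the read's offset into the flat fragment range):
//
//    if (read_dist != -1 && read_dist <= read_distance_limit) {
//        read_uf.union_groups(a, b);                     // same read cluster
//        if (fragment_distance_limit != 0) {
//            fragment_uf.union_groups(a + off, b + off); // same fragment too
//        }
//    } else if (fragment_distance_limit != 0 && fragment_dist != -1
//               && fragment_dist <= fragment_distance_limit) {
//        fragment_uf.union_groups(a + off, b + off);     // fragment only
//    }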
//Updates the distances stored for the read clusters - if (read_dist <= tree_state.read_distance_limit) { + if (read_dist != -1 && read_dist <= tree_state.read_distance_limit) { //If the clusters are close enough to combine in the read if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_group != -1) { @@ -1160,6 +1212,8 @@ cerr << " Combining this cluster from the right" << endl; } fragment_combined_group = tree_state.fragment_union_find.find_group(new_group+tree_state.read_index_offsets[read_num]); } + pair& end_dists = tree_state.read_cluster_dists[read_num][new_group]; + if (combined_group == -1) { snarl_clusters.read_cluster_heads.emplace(read_num,new_group); tree_state.read_cluster_dists[read_num][new_group] = end_dists; @@ -1192,13 +1246,12 @@ cerr << " Combining this cluster from the right" << endl; && fragment_dist <= tree_state.fragment_distance_limit) { //Same fragment - if (fragment_combined_group == -1) { - fragment_combined_group = new_group; - } else { - tree_state.fragment_union_find.union_groups( - new_group + tree_state.read_index_offsets[read_num], fragment_combined_group); - fragment_combined_group = tree_state.fragment_union_find.find_group(new_group); + if (fragment_combined_group != -1) { + //Also combine fragment clusters + tree_state.fragment_union_find.union_groups(new_group+tree_state.read_index_offsets[read_num], + fragment_combined_group); } + fragment_combined_group = tree_state.fragment_union_find.find_group(new_group+tree_state.read_index_offsets[read_num]); } return; }; @@ -1250,7 +1303,7 @@ cerr << " Combining this cluster from the right" << endl; cerr << "Node rank is " << node_rank << " fwd, " << rev_rank << " rev of " << snarl_index.num_nodes * 2 << endl; cerr << "Clusters at this child:" << endl; - for (pair c : child_nodes[i].second.read_cluster_heads) { + for (pair c : curr_child_clusters.read_cluster_heads) { cerr << "\tdist left: " << tree_state.read_cluster_dists[c.first][c.second].first << " dist right: " << tree_state.read_cluster_dists[c.first][c.second].second << endl; cerr << "\t\t"; @@ -1369,8 +1422,6 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank pair child_cluster_head = children_i[c_i]; size_t read_num = child_cluster_head.first; size_t c_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - - pair new_dists = tree_state.read_cluster_dists[read_num][c_group]; pair dists_c = old_dists[child_cluster_head]; @@ -1378,33 +1429,37 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank && other_node_clusters.fragment_best_left != -1 ) { //If cluster child_cluster_head can be combined with clusters in j //from the left of both of them + int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : + dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_l_l[read_num], fragment_group_l_l, dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1, - dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1, - new_dists, read_num); + read_dist, read_num); } if (dist_l_r != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j + int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? 
-1 : + dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1, - dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1, - new_dists, read_num); + read_dist, read_num); } if (dist_r_l != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_left != -1 ) { + int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : + dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1, - dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1, - new_dists, read_num); + read_dist, read_num); } if (dist_r_r != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_right != -1 ) { + int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? -1 : + dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_r_r[read_num], fragment_group_r_r, dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1, - dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1, - new_dists, read_num); + read_dist, read_num); } } @@ -1422,42 +1477,46 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank //snarl_cluster heads pair child_cluster_head = children_j[k_i]; size_t read_num = child_cluster_head.first; - pair& dist_bounds_k = old_dists[child_cluster_head]; + pair& dists_k = old_dists[child_cluster_head]; size_t k_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - pair dists_k = tree_state.read_cluster_dists[read_num][k_group]; - if (dist_l_l != -1 && curr_child_clusters.read_best_left[read_num] != -1 - && dist_bounds_k.first != -1 ){ + if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 + && dists_k.first != -1 ){ + int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : + dist_l_l + curr_child_clusters.read_best_left[read_num] + dists_k.first-1; combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, - dist_l_l + curr_child_clusters.fragment_best_left + dist_bounds_k.first-1, - dist_l_l + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.first-1, - dists_k, read_num); + dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1, + read_dist, read_num); } - if (dist_l_r != -1 && curr_child_clusters.read_best_left[read_num] != -1 - && dist_bounds_k.second != -1 ) { + if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 + && dists_k.second != -1 ) { + + int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? 
-1 : + dist_l_r + curr_child_clusters.read_best_left[read_num] + dists_k.second-1; combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, - dist_l_r + curr_child_clusters.fragment_best_left + dist_bounds_k.second-1, - dist_l_r + curr_child_clusters.read_best_left[read_num] + dist_bounds_k.second-1, - dists_k, read_num); + dist_l_r + curr_child_clusters.fragment_best_left + dists_k.second-1, + read_dist, read_num); } - if (dist_r_l != -1 && curr_child_clusters.read_best_right[read_num] != -1 - && dist_bounds_k.first != -1 ) { + if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 + && dists_k.first != -1 ) { + int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : + dist_r_l + curr_child_clusters.read_best_right[read_num] + dists_k.first-1; combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, - dist_r_l + curr_child_clusters.fragment_best_right + dist_bounds_k.first-1, - dist_r_l + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.first-1, - dists_k,read_num); + dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1, + read_dist, read_num); } - if (dist_r_r != -1 && curr_child_clusters.read_best_right[read_num] != -1 - && dist_bounds_k.second != -1 ) { + if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 + && dists_k.second != -1 ) { + int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : + dist_r_r + curr_child_clusters.read_best_right[read_num] + dists_k.second-1; combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, - dist_r_r + curr_child_clusters.fragment_best_right + dist_bounds_k.second-1, - dist_r_r + curr_child_clusters.read_best_right[read_num] + dist_bounds_k.second-1, - dists_k, read_num); + dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1, + read_dist, read_num); } } } @@ -1470,22 +1529,37 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank << snarl_clusters.fragment_best_right << endl; bool got_left = false; bool got_right = false; - for (pair c : snarl_clusters.read_cluster_heads) { - pair dists = tree_state.read_cluster_dists[c.first][c.second]; - if (dists.first == snarl_clusters.fragment_best_left) {got_left = true;} - if (dists.second == snarl_clusters.fragment_best_right) {got_right = true;} - cerr << "\t" << c.first << ":" << c.second << ": left: " << dists.first << " right : " - << dists.second << endl; - cerr << "\t\t"; - bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { - if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; - has_seeds = true; + for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { + cerr << " for read num " << read_num << " best left: " << snarl_clusters.read_best_left[read_num] << " best right: " << snarl_clusters.read_best_right[read_num] << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : snarl_clusters.read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists = tree_state.read_cluster_dists[c.first][c.second]; + cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + if (tree_state.read_union_find[c.first].find_group(x) == c.second) { + cerr << tree_state.all_seeds->at(c.first)[x] << " "; + has_seeds = true; + } + } + assert(dists.first == -1 || dists.first >= snarl_clusters.read_best_left[read_num]); + 
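// (The paired asserts here pin down the summary invariant: every cluster
// head's left/right distance is bounded below by both the per-read and the
// per-fragment best, so the best_* fields are true minima over their heads.)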
assert(dists.second == -1 || dists.second >= snarl_clusters.read_best_right[read_num]); + assert(dists.first == -1 || dists.first >= snarl_clusters.fragment_best_left); + assert(dists.second == -1 || dists.second >= snarl_clusters.fragment_best_right); + if (dists.first == snarl_clusters.fragment_best_left) {got_left = true;} + if (dists.second == snarl_clusters.fragment_best_right) {got_right = true;} + if (dists.first == snarl_clusters.read_best_left[read_num]) {got_read_left = true;} + if (dists.second == snarl_clusters.read_best_right[read_num]) {got_read_right = true;} + cerr << endl; + assert(has_seeds); } } - assert(has_seeds); - cerr << endl; + assert(!any_clusters ||got_read_left || snarl_clusters.read_best_left[read_num] == -1); + assert(!any_clusters ||got_read_right || snarl_clusters.read_best_right[read_num] == -1); } assert(got_left); assert(got_right); diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index d90410798ae..f95f0261ae9 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -157,7 +157,6 @@ class SnarlSeedClusterer { //These values are only relevant for seeds that represent a cluster //in union_find_reads vector>> read_cluster_dists; - vector> fragment_cluster_dists; @@ -196,12 +195,11 @@ class SnarlSeedClusterer { TreeState (vector>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), - fragment_cluster_dists(all_seeds->size(), make_pair(-1, -1)), read_distance_limit(read_distance_limit), fragment_distance_limit(fragment_distance_limit), - fragment_union_find (seed_count, false) { + fragment_union_find (seed_count, false), + read_index_offsets(1,0){ - read_index_offsets.push_back(0); size_t total_seeds = 0; for (vector& v : *all_seeds) { total_seeds += v.size(); diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 7afbd2a851b..8e107e8e99d 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -472,7 +472,7 @@ namespace unittest { vector>fragment_clusters = std::get<1>(paired_clusters); REQUIRE( read_clusters.size() == 1); - REQUIRE( read_clusters[0].size() == 1); + REQUIRE( read_clusters[0].size() == 4); REQUIRE( fragment_clusters.size() == 4); //New fragment clusters @@ -825,7 +825,7 @@ namespace unittest { TEST_CASE("Random graphs", "[cluster]"){ - for (int i = 0; i < 0; i++) { + for (int i = 0; i < 1000; i++) { // For each random graph VG graph; random_graph(1000, 20, 100, &graph); @@ -886,86 +886,89 @@ namespace unittest { ordered_seeds.push_back(s); } - for (size_t read_num = 0 ; read_num <= 2 ; read_num ++) { + for (size_t read_num = 0 ; read_num < 2 ; read_num ++) { auto& one_read_clusters = read_clusters[read_num]; - for (size_t a = 0; a < one_read_clusters.size(); a++) { - // For each cluster -cluster this cluster to ensure that - // there is only one - vector clust = one_read_clusters[a]; - - structures::UnionFind new_clusters (clust.size(), false); - - for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { - pos_t pos1 = all_seeds[read_num][clust[i1]]; - size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); - pos_t rev1 = make_pos_t(get_id(pos1), - !is_rev(pos1), - len1 - get_offset(pos1)-1); - - for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { - if (b != a) { - //For each other cluster - vector clust2 = one_read_clusters[b]; - for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { - //And each position in each other cluster, - //make sure that this position is far away from i1 - pos_t 
pos2 = all_seeds[read_num][clust2[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should have been in the same read cluster: " ; - cerr << pos1 << " and " << pos2 << endl; - cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; - REQUIRE(false); + if (one_read_clusters.size() > 0) { + for (size_t a = 0; a < one_read_clusters.size(); a++) { + // For each cluster -cluster this cluster to ensure that + // there is only one + cerr << a << " of " << one_read_clusters.size() << endl; + vector clust = one_read_clusters[a]; + + structures::UnionFind new_clusters (clust.size(), false); + + for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { + pos_t pos1 = all_seeds[read_num][clust[i1]]; + size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); + pos_t rev1 = make_pos_t(get_id(pos1), + !is_rev(pos1), + len1 - get_offset(pos1)-1); + + for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { + if (b != a) { + //For each other cluster + vector clust2 = one_read_clusters[b]; + for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { + //And each position in each other cluster, + //make sure that this position is far away from i1 + pos_t pos2 = all_seeds[read_num][clust2[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + + int64_t dist1 = dist_index.minDistance(pos1, pos2); + int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should have been in the same read cluster: " ; + cerr << pos1 << " and " << pos2 << endl; + cerr << dist1 << " " << dist2 << " " << dist3 << " " << dist4 << endl; + REQUIRE(false); + } + } - } } - } - for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { - //For each position in the same cluster - pos_t pos2 = all_seeds[read_num][clust[i2]]; - size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); - pos_t rev2 = make_pos_t(get_id(pos2), - !is_rev(pos2), - len2 - get_offset(pos2)-1); - int64_t dist1 = dist_index.minDistance(pos1, pos2); - int64_t dist2 = dist_index.minDistance(pos1, rev2); - int64_t dist3 = dist_index.minDistance(rev1, pos2); - int64_t dist4 = dist_index.minDistance(rev1, rev2); - int64_t dist = MinimumDistanceIndex::minPos({dist1, - dist2, dist3, dist4}); - if ( dist != -1 && dist <= read_lim) { - new_clusters.union_groups(i1, i2); - } + for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { + //For each position in the same cluster + pos_t pos2 = all_seeds[read_num][clust[i2]]; + size_t len2 = graph.get_length(graph.get_handle(get_id(pos2), false)); + pos_t rev2 = make_pos_t(get_id(pos2), + !is_rev(pos2), + len2 - get_offset(pos2)-1); + int64_t dist1 = dist_index.minDistance(pos1, pos2); 
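// (Note: minDistance is directed, so dist1..dist4 below try both
// orientations of both positions; taking minPos over the four combinations
// gives an orientation-free minimum distance to compare against read_lim.)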
+ int64_t dist2 = dist_index.minDistance(pos1, rev2); + int64_t dist3 = dist_index.minDistance(rev1, pos2); + int64_t dist4 = dist_index.minDistance(rev1, rev2); + int64_t dist = MinimumDistanceIndex::minPos({dist1, + dist2, dist3, dist4}); + if ( dist != -1 && dist <= read_lim) { + new_clusters.union_groups(i1, i2); + } + } } - } - auto actual_clusters = new_clusters.all_groups(); - if (actual_clusters.size() != 1) { - dist_index.printSelf(); - graph.serialize_to_file("testGraph"); - cerr << "These should be different read clusters: " << endl; - for (auto c : actual_clusters) { - cerr << "cluster: " ; - for (size_t i1 : c) { - cerr << all_seeds[read_num][clust[i1]] << " "; + auto actual_clusters = new_clusters.all_groups(); + if (actual_clusters.size() != 1) { + dist_index.printSelf(); + graph.serialize_to_file("testGraph"); + cerr << "These should be different read clusters: " << endl; + for (auto c : actual_clusters) { + cerr << "cluster: " ; + for (size_t i1 : c) { + cerr << all_seeds[read_num][clust[i1]] << " "; + } + cerr << endl; } - cerr << endl; } + REQUIRE(actual_clusters.size() == 1); } - REQUIRE(actual_clusters.size() == 1); } } for (size_t a = 0; a < fragment_clusters.size(); a++) { From 645941cfea508fef37a372c165a989b148bd35f1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 12:43:06 -0500 Subject: [PATCH 30/79] add probablistic support caller --- src/algorithms/coverage_depth.cpp | 29 ++ src/algorithms/coverage_depth.hpp | 17 +- src/graph_caller.cpp | 69 ++++- src/graph_caller.hpp | 17 +- src/snarl_caller.cpp | 428 ++++++++++++++++++++++++++---- src/snarl_caller.hpp | 155 ++++++++--- src/subcommand/call_main.cpp | 37 ++- src/traversal_support.cpp | 22 +- src/traversal_support.hpp | 7 +- 9 files changed, 667 insertions(+), 114 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index b68b8c50b55..0d63f656cb9 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -130,6 +130,35 @@ vector> binned_packed_depth(const Packer& return binned_depths; } +unordered_map>> binned_packed_depth_index(const Packer& packer, + const vector& path_names, + size_t bin_size, + size_t min_coverage, + bool include_deletions, + bool std_err) { + unordered_map>> depth_index; + for (const string& path_name : path_names) { + vector> binned_depths = binned_packed_depth(packer, path_name, bin_size, + min_coverage, include_deletions); + // todo: probably more efficent to just leave in sorted vector + map>& depth_map = depth_index[path_name]; + for (auto& binned_depth : binned_depths) { + double var = get<3>(binned_depth); + // optionally convert variance to standard error + if (std_err) { + var = sqrt(var / (double)(get<1>(binned_depth) - get<2>(binned_depth))); + } + depth_map[get<0>(binned_depth)] = make_pair(get<2>(binned_depth), var); + } + } + + return depth_index; +} + +const pair& get_depth_from_index(const unordered_map>>& depth_index, + const string& path_name, size_t offset) { + return depth_index.at(path_name).lower_bound(offset)->second; +} // draw (roughly) max_nodes nodes from the graph using the random seed static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 730cd316f5a..9084949df36 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -11,7 +11,6 @@ #include "handle.hpp" #include "packer.hpp" - namespace vg { namespace 
algorithms { @@ -35,7 +34,21 @@ pair packed_depth_of_bin(const Packer& packer, step_handle_t sta /// Use all available threads to estimate the binned packed coverage of a path using above function /// Each element is a bin's 0-based open-ended interval in the path, and its coverage mean,variance. vector> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size, - size_t min_coverage, bool include_deletions); + size_t min_coverage, bool include_deletions); + +/// Use the above function to retrieve the binned depths of a list of paths, and store them indexed by start +/// coordinate. If std_err is true, store the standard error instead of the variance +using BinnedDepthIndex = unordered_map>>; +BinnedDepthIndex binned_packed_depth_index(const Packer& packer, + const vector& path_names, + size_t bin_size, + size_t min_coverage, + bool include_deletions, + bool std_err); + +/// Query index created above +/// Todo: optionally smooth over adjacent bins? +const pair& get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t offset); /// Return the mean and variance of coverage of randomly sampled nodes from a GAM /// Nodes with less than min_coverage are ignored diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 4387ef0b63d..55e8d097759 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -143,8 +143,12 @@ bool VCFGenotyper::call_snarl(const Snarl& snarl) { } } + // find a path range corresponding to our snarl by way of the VCF variants. + tuple ref_positions = get_ref_positions(variants); + // use our support caller to choose our genotype (in traversal coordinates) - vector trav_genotype = snarl_caller.genotype(snarl, travs, ref_trav_idx, 2); + vector trav_genotype = snarl_caller.genotype(snarl, travs, ref_trav_idx, 2, get<0>(ref_positions), + make_pair(get<1>(ref_positions), get<2>(ref_positions))); assert(trav_genotype.empty() || trav_genotype.size() == 2); @@ -231,6 +235,35 @@ string VCFGenotyper::vcf_header(const PathHandleGraph& graph, const vector VCFGenotyper::get_ref_positions(const vector& variants) const { + // if there is more than one path in our snarl (unlikely for most graphs we'll vcf-genotype) + // then we return the one with the biggest interval + map> path_offsets; + for (const vcflib::Variant* var : variants) { + if (path_offsets.count(var->sequenceName)) { + pair& record = path_offsets[var->ref]; + record.first = std::min((size_t)var->position, record.first); + record.second = std::max((size_t)var->position + var->ref.length(), record.second); + } else { + path_offsets[var->sequenceName] = make_pair(var->position, var->position + var->ref.length()); + } + } + + string ref_path; + size_t ref_range_size = 0; + pair ref_range; + for (auto& path_offset : path_offsets) { + size_t len = path_offset.second.second - path_offset.second.first; + if (len > ref_range_size) { + ref_range_size = len; + ref_path = path_offset.first; + ref_range = path_offset.second; + } + } + + return make_tuple(ref_path, ref_range.first, ref_range.second); +} + unordered_map VCFGenotyper::scan_contig_lengths() const { unordered_map ref_lengths; @@ -387,7 +420,8 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { string path_name = find_index(snarl, is_vg ?
path_indexes : site_path_indexes).first; // orient the snarl along the reference path - if (get_ref_position(snarl, path_name).second == true) { + tuple ref_interval = get_ref_interval(snarl, path_name); + if (get<2>(ref_interval) == true) { snarl_manager.flip(&snarl); } @@ -396,12 +430,14 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { // these integers map the called traversals to their positions in the list of all traversals // of the top level snarl. vector genotype; - std::tie(called_traversals, genotype) = top_down_genotype(snarl, *rep_trav_finder, 2); + std::tie(called_traversals, genotype) = top_down_genotype(snarl, *rep_trav_finder, 2, + path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); if (!called_traversals.empty()) { // regenotype our top-level traversals now that we know they aren't nested, and we have a // good idea of all the sizes - std::tie(called_traversals, genotype) = re_genotype(snarl, *rep_trav_finder, called_traversals, genotype, 2); + std::tie(called_traversals, genotype) = re_genotype(snarl, *rep_trav_finder, called_traversals, genotype, 2, + path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); // emit our vcf variant emit_variant(snarl, *rep_trav_finder, called_traversals, genotype, path_name); @@ -431,13 +467,14 @@ string LegacyCaller::vcf_header(const PathHandleGraph& graph, const vector, vector> LegacyCaller::top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy) const { +pair, vector> LegacyCaller::top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy, + const string& ref_path_name, pair ref_interval) const { // get the traversals through the site vector traversals = trav_finder.find_traversals(snarl); // use our support caller to choose our genotype - vector trav_genotype = snarl_caller.genotype(snarl, traversals, 0, ploidy); + vector trav_genotype = snarl_caller.genotype(snarl, traversals, 0, ploidy, ref_path_name, ref_interval); if (trav_genotype.empty()) { return make_pair(vector(), vector()); } @@ -470,7 +507,7 @@ pair, vector> LegacyCaller::top_down_genotype(const snarl_manager.flip(into_snarl); } vector child_genotype = top_down_genotype(*into_snarl, - trav_finder, hom ? 2: 1).first; + trav_finder, hom ? 
2: 1, ref_path_name, ref_interval).first; if (child_genotype.empty()) { return make_pair(vector(), vector()); } @@ -533,9 +570,11 @@ SnarlTraversal LegacyCaller::get_reference_traversal(const Snarl& snarl, Travers pair, vector> LegacyCaller::re_genotype(const Snarl& snarl, TraversalFinder& trav_finder, const vector& in_traversals, const vector& in_genotype, - int ploidy) const { + int ploidy, + const string& ref_path_name, + pair ref_interval) const { assert(in_traversals.size() == in_genotype.size()); - + // create a set of unique traversal candidates that must include the reference first vector rg_traversals; // add our reference traversal to the front @@ -556,7 +595,7 @@ pair, vector> LegacyCaller::re_genotype(const Snarl& } // re-genotype the candidates - vector rg_genotype = snarl_caller.genotype(snarl, rg_traversals, 0, ploidy); + vector rg_genotype = snarl_caller.genotype(snarl, rg_traversals, 0, ploidy, ref_path_name, ref_interval); // convert our output to something that emit_variant() will understand // todo: this is needlessly inefficient and should be streamlined to operate @@ -634,7 +673,7 @@ void LegacyCaller::emit_variant(const Snarl& snarl, TraversalFinder& trav_finder // fill out the rest of the variant out_variant.sequenceName = ref_path_name; // +1 to convert to 1-based VCF - out_variant.position = get_ref_position(snarl, ref_path_name).first + ref_offsets.find(ref_path_name)->second + 1; + out_variant.position = get<0>(get_ref_interval(snarl, ref_path_name)) + ref_offsets.find(ref_path_name)->second + 1; out_variant.id = std::to_string(snarl.start().node_id()) + "_" + std::to_string(snarl.end().node_id()); out_variant.filter = "PASS"; out_variant.updateAlleleIndexes(); @@ -691,7 +730,7 @@ pair LegacyCaller::find_index(const Snarl& snarl, const vect return make_pair("", nullptr); } -pair LegacyCaller::get_ref_position(const Snarl& snarl, const string& ref_path_name) const { +tuple LegacyCaller::get_ref_interval(const Snarl& snarl, const string& ref_path_name) const { path_handle_t path_handle = graph.get_path_handle(ref_path_name); handle_t start_handle = graph.get_handle(snarl.start().node_id(), snarl.start().backward()); @@ -743,7 +782,11 @@ pair LegacyCaller::get_ref_position(const Snarl& snarl, const stri size_t end_position = end_step == end_steps.begin()->second ? end_steps.begin()->first : graph.get_position_of_step(end_step); bool backward = end_position < start_position; - return make_pair(backward ? end_position : start_position, backward); + if (backward) { + return make_tuple(end_position, start_position, backward); + } else { + return make_tuple(start_position, end_position, backward); + } } void LegacyCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool backward) const { diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index e4f1367f2e7..2f4d8c38bbb 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -102,6 +102,9 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller { protected: + /// get path positions bounding a set of variants + tuple get_ref_positions(const vector& variants) const; + /// munge out the contig lengths from the VCF header virtual unordered_map scan_contig_lengths() const; @@ -145,7 +148,8 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { /// recursively genotype a snarl /// todo: can this be pushed to a more generic class? 
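// The get_ref_interval() change above normalizes the snarl's path interval so the
// smaller coordinate always comes first, with a bool recording that the snarl runs
// backward along the path. A minimal sketch of that convention with plain std types
// (normalize_interval is a hypothetical name, not the vg API):
#include <algorithm>
#include <cstddef>
#include <tuple>

static std::tuple<size_t, size_t, bool> normalize_interval(size_t start_pos, size_t end_pos) {
    bool backward = end_pos < start_pos;  // snarl end found before its start on the path
    return std::make_tuple(std::min(start_pos, end_pos), std::max(start_pos, end_pos), backward);
}
// Callers can then treat the first two components as a forward-coordinate range without
// re-checking orientation, which is how ref_interval is consumed in the calls above.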
- pair, vector> top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy) const; + pair, vector> top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy, + const string& ref_path_name, pair ref_interval) const; /// we need the reference traversal for VCF, but if the ref is not called, the above method won't find it. SnarlTraversal get_reference_traversal(const Snarl& snarl, TraversalFinder& trav_finder) const; @@ -153,10 +157,13 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { /// re-genotype output of top_down_genotype. it may give slightly different results as /// it's working with fully-defined traversals and can exactly determine lengths and supports /// it will also make sure the reference traversal is in the beginning of the output - pair, vector> re_genotype(const Snarl& snarl, TraversalFinder& trav_finder, + pair, vector> re_genotype(const Snarl& snarl, + TraversalFinder& trav_finder, const vector& in_traversals, const vector& in_genotype, - int ploidy) const; + int ploidy, + const string& ref_path_name, + pair ref_interval) const; /// print a vcf variant void emit_variant(const Snarl& snarl, TraversalFinder& trav_finder, const vector& called_traversals, @@ -168,9 +175,9 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { /// look up a path index for a site and return its name too pair find_index(const Snarl& snarl, const vector path_indexes) const; - /// get the position of a snarl from our reference path using the PathPositionHandleGraph interface + /// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface /// the bool is true if the snarl's backward on the path - pair get_ref_position(const Snarl& snarl, const string& ref_path_name) const; + tuple get_ref_interval(const Snarl& snarl, const string& ref_path_name) const; /// clean up the alleles to not share common prefixes / suffixes void flatten_common_allele_ends(vcflib::Variant& variant, bool backward) const; diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index e13c66fb6e9..77393dce867 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -1,7 +1,7 @@ #include "snarl_caller.hpp" #include "genotypekit.hpp" -//#define debug +#define debug namespace vg { @@ -18,23 +18,28 @@ SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, S graph(graph), snarl_manager(snarl_manager), support_finder(support_finder) { + } SupportBasedSnarlCaller::~SupportBasedSnarlCaller() { } -void SupportBasedSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { - // want to move away from ugly hacks that treat the reference traversal differently, - // so keep all these set the same - if (het_bias >= 0) { - max_het_bias = het_bias; - max_ref_het_bias = het_bias; - max_indel_het_bias = het_bias; - } - if (ref_het_bias >= 0) { - max_ref_het_bias = ref_het_bias; - } +void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant) { + + +} + +const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { + return support_finder; +} + +int SupportBasedSnarlCaller::get_min_total_support_for_call() const { + return min_total_support_for_call; } void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support) { @@ -49,10 +54,45 @@ void SupportBasedSnarlCaller::set_min_supports(double 
min_mad_for_call, double m } } -vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, +int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { + int best_allele = -1; + for(size_t i = 0; i < supports.size(); i++) { + if(std::find(skips.begin(), skips.end(), i) == skips.end() && ( + best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) { + best_allele = i; + } + } + return best_allele; +} + +RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder) : + SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { +} + +RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { + +} + +void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + +vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, - int ploidy) { + int ploidy, + const string& ref_path_name, + pair ref_range) { #ifdef debug cerr << "Support calling site " << pb2json(snarl) << endl; @@ -62,7 +102,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); int best_allele = get_best_support(supports, {}); #ifdef debug @@ -77,7 +117,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); vector skips = {best_allele}; for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); @@ -90,7 +130,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, } } // get the supports of each traversal in light of best - vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); int second_best_allele = get_best_support(secondary_supports, {skips}); // get the supports of each traversal in light of second best @@ -99,7 +139,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, int third_best_allele = -1; if (second_best_allele != -1) { // prune out traversals whose exclusive support relative to second best doesn't pass cut - vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, true, false, ref_trav_idx); + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, 
{second_best_allele}, true, false, false, ref_trav_idx); skips.push_back(best_allele); skips.push_back(second_best_allele); for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { @@ -108,7 +148,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, skips.push_back(i); } } - tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, ref_trav_idx); + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, false, ref_trav_idx); third_best_allele = get_best_support(tertiary_supports, skips); } @@ -151,7 +191,7 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, // Single ploidy case when doing recursive genotyping. Just return the best allele if (ploidy == 1) { - return {best_allele}; + return vector(1, best_allele); } // Call 1/2 : REF-Alt1/Alt2 even if Alt2 has only third best support else if (ploidy >= 2 && @@ -238,12 +278,14 @@ vector SupportBasedSnarlCaller::genotype(const Snarl& snarl, } + // Todo: specify call_info to use new interface, then fix up update_vcf_info to read it, + // and move common logic up to SupportBasedCaller if possible. return genotype; } -void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, +void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, const vector& traversals, - const vector& genotype, + const vector& genotype, const string& sample_name, vcflib::Variant& variant) { @@ -255,11 +297,11 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, shared_travs.push_back(genotype[0]); } // compute the support of our called alleles - vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, 0); + // todo: I think this undercounts support. 
should be fixed (as in Poisson version) + vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, false, 0); // get the support of our uncalled alleles, making sure to not include any called support - // TODO: handle shared support within this set - vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, 0); + vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, 0); // Set up the depth format field variant.format.push_back("DP"); @@ -353,7 +395,7 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, } } -void SupportBasedSnarlCaller::update_vcf_header(string& header) const { +void RatioSupportSnarlCaller::update_vcf_header(string& header) const { header += "##INFO=\n"; header += "##FORMAT=\n"; header += "##FORMAT=\n"; @@ -368,18 +410,7 @@ void SupportBasedSnarlCaller::update_vcf_header(string& header) const { std::to_string(min_site_depth) + "\">\n"; } -int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { - int best_allele = -1; - for(size_t i = 0; i < supports.size(); i++) { - if(std::find(skips.begin(), skips.end(), i) == skips.end() && ( - best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) { - best_allele = i; - } - } - return best_allele; -} - -function SupportBasedSnarlCaller::get_skip_allele_fn() const { +function RatioSupportSnarlCaller::get_skip_allele_fn() const { // port over cutoff used in old support caller (there avg support was used all the time, here // we use the same toggles as when genotyping) return [&](const SnarlTraversal& trav) -> bool { @@ -387,17 +418,7 @@ function SupportBasedSnarlCaller::get_skip_allele_f }; } - -int SupportBasedSnarlCaller::get_min_total_support_for_call() const { - return min_total_support_for_call; -} - -const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { - return support_finder; -} - - -double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, +double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, int second_best_trav, int ref_trav_idx) const { bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); @@ -430,6 +451,315 @@ double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int } +PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder, + const algorithms::BinnedDepthIndex& depth_index) : + SupportBasedSnarlCaller(graph, snarl_manager, support_finder), + depth_index(depth_index) { + +} + +PoissonSupportSnarlCaller::~PoissonSupportSnarlCaller() { + +} + +vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range) { + + +#ifdef debug + cerr << "Poisson Support calling site " << pb2json(snarl) + << " on path " << ref_path_name << ":" << ref_range.first << "-" << ref_range.second << endl; +#endif + + assert(ploidy == 2 || ploidy == 1); + + // get the traversal sizes + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + + // get the supports of each traversal independently + vector supports =
support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); + + // sort the traversals by support + vector ranked_traversals = rank_by_support(supports); + size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size()); + + // the candidate genotypes and their supports. the numbers here are alleles as indexed in traversals[] + map, vector> candidates; + + // consider each of the top 25 traversals as our top_traversal + for (int i = 0; i < max_trav; ++i) { + + int best_allele = ranked_traversals[i]; + + if (ploidy == 1) { + candidates[{best_allele}] = {supports[best_allele]}; + } else { + assert(ploidy == 2); + + // we prune out traversals whose exclusive support (structure that is not shared with best traversal) + // doesn't meet a certain cutoff + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); + set skips = {best_allele}; + for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { + if (j != best_allele && support_val(secondary_exclusive_supports[j]) <= min_total_support_for_call) { + skips.insert(j); + } + } + + // get the supports of each traversal in light of best + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); + vector ranked_secondary_traversals = rank_by_support(secondary_supports); + + // add the homozygous genotype for our best allele + candidates[{best_allele, best_allele}] = {supports[best_allele], supports[best_allele]}; + + // now look at the top-k second-best traversals + size_t sec_count = 0; + for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { + int second_best_allele = ranked_secondary_traversals[j]; + if (!skips.count(second_best_allele) && second_best_allele != best_allele) { + // second best allele's support, sharing nodes with best + Support& second_best_support = secondary_supports[second_best_allele]; + // best allele's support, sharing nodes with second best + Support best_support_het = support_finder.get_traversal_set_support( + {traversals[best_allele], traversals[second_best_allele]}, + {1}, false, false, false, ref_trav_idx)[0]; + + // canonical ordering for our set + if (best_allele < second_best_allele) { + candidates[{best_allele, second_best_allele}] = {best_support_het, second_best_support}; + } else { + candidates[{second_best_allele, best_allele}] = {second_best_support, best_support_het}; + } + // also make sure we have our homozygous genotype for the second best allele + candidates[{second_best_allele, second_best_allele}] = {supports[second_best_allele], supports[second_best_allele]}; + ++sec_count; + } + } + } + } + + // expected depth from our coverage + const pair& start_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first); + const pair& end_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.second); + double exp_depth = (start_depth.first + end_depth.first) / 2.; + double depth_err = (start_depth.second + end_depth.second) / 2.; + assert(!isnan(exp_depth) && !isnan(depth_err)); + + // genotype (log) likelihoods + double best_genotype_likelihood = -numeric_limits::max(); + vector best_genotype; + for (const auto& candidate : candidates) { + double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); + if (gl > best_genotype_likelihood) { + best_genotype_likelihood = gl; + 
best_genotype = candidate.first; + } + } + + return best_genotype; +} + +double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, + const vector& genotype_supports, + const vector& traversals, + int ref_trav_idx, double exp_depth, double depth_err) { + + assert(genotype_supports.size() == genotype.size()); + assert(genotype.size() == 1 || genotype.size() == 2); + + + // we need the support of all traversals *not* in the genotype. + Support total_other_support; + // we are running in a mode that will ignore stuff in our genotype, and only count the remainders once. + vector other_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, ref_trav_idx); + for (auto& other_support : other_supports) { + total_other_support += other_support; + } + + // split the homozygous support into two + // from now on we'll treat it like two separate observations, each with half coverage + vector fixed_genotype_supports = genotype_supports; + if (std::equal(genotype_supports.begin() + 1, genotype_supports.end(), genotype_supports.begin(), + [&](const Support& s1, const Support& s2) { return support_val(s1) == support_val(s2); })) { + for (int i = 0; i < genotype_supports.size(); ++i) { + fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); + } + } + + // total support of the site + Support total_site_support = total_other_support; + for (auto& support : fixed_genotype_supports) { + total_site_support += support; + } + + // how many reads would we expect to not map to our genotype due to error + double error_rate = std::min(0.95, depth_err + baseline_mapping_error); + double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support); + + // and our likelihood for the unmapped reads we see: + double other_log_likelihood = poisson_prob_ln(std::round(support_val(total_other_support)), other_poisson_lambda); + + // how many reads do we expect for an allele? we use the expected coverage and just + // divide it out by the size of the genotype. + double allele_poisson_lambda = (exp_depth / (double)genotype.size()) * (1. 
- error_rate); + +#ifdef debug + cerr << "Computing prob of genotype: {"; + for (int i = 0; i < genotype.size(); ++i) { + cerr << genotype[i] << ","; + } + cerr << "}: tot_other_sup = " << total_other_support << " tot site sup = " << total_site_support + << " exp-depth = " << exp_depth << " depth-err = " << depth_err << " other-lambda = " << other_poisson_lambda + << " allele-lambda " << allele_poisson_lambda << endl; +#endif + + // now we compute the likelihood of our genotype + double alleles_log_likelihood = 0; + for (int i = 0; i < fixed_genotype_supports.size(); ++i) { + double allele_ll = poisson_prob_ln(std::round(support_val(fixed_genotype_supports[i])), allele_poisson_lambda); + alleles_log_likelihood += allele_ll; + +#ifdef debug + cerr << " a[" << i <<"]=" << genotype[i] << " sup=" << genotype_supports[i] << " fix-sup=" << fixed_genotype_supports[i] + << " prob " << allele_ll << endl; +#endif + } + +#ifdef debug + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood + << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; +#endif + + return alleles_log_likelihood + other_log_likelihood; +} + +void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant) { + + assert(traversals.size() == variant.alleles.size()); + + // Get the depth of the site + + // get the unique supports (useful only for getting a total) + vector unique_supports = support_finder.get_traversal_set_support(traversals, {}, false, true, true, 0); + Support site_support; + for (const Support& sup : unique_supports) { + site_support += sup; + } + double total_site_depth = support_val(site_support); + + // Set the variant's total depth + string depth_string = std::to_string((int64_t)round(total_site_depth)); + variant.format.push_back("DP"); + variant.info["DP"].push_back(depth_string); // We only have one sample, so variant depth = sample depth + + // And for the sample + variant.samples[sample_name]["DP"].push_back(depth_string); + + // get the allele depths + variant.format.push_back("AD"); + set called_allele_set(genotype.begin(), genotype.end()); + + for (int i = 0; i < traversals.size(); ++i) { + vector shared_travs; + bool in_genotype = called_allele_set.count(i); + if (in_genotype) { + // if we're in the genotype, then we share support with other alleles. 
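// genotype_likelihood() above multiplies Poisson terms: one per called allele with rate
// (exp_depth / ploidy) * (1 - error_rate), plus one for reads on uncalled traversals
// with rate error_rate * exp_depth. A minimal standalone sketch of those pieces,
// assuming poisson_prob_ln is the standard Poisson log-pmf (illustrative helpers only,
// not the exact vg utilities):
#include <cmath>
#include <cstdint>

static double poisson_log_pmf(int64_t k, double lambda) {
    // ln P(k; lambda) = k*ln(lambda) - lambda - ln(k!), where lgamma(k+1) == ln(k!)
    return k * std::log(lambda) - lambda - std::lgamma(k + 1.);
}

static double diploid_log_likelihood(int64_t allele1_reads, int64_t allele2_reads,
                                     int64_t other_reads, double exp_depth,
                                     double error_rate) {
    double allele_lambda = (exp_depth / 2.) * (1. - error_rate);  // expected reads per haplotype
    double other_lambda = error_rate * exp_depth;                 // expected stray reads
    return poisson_log_pmf(allele1_reads, allele_lambda)
        + poisson_log_pmf(allele2_reads, allele_lambda)
        + poisson_log_pmf(other_reads, other_lambda);
}
// Dividing such a natural-log value by ln(10) (approx. 2.3026) converts it to the log10
// scale that the VCF GL field expects, which is what the gl /= 2.30258 line below does.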
+ for (int a : called_allele_set) { + if (a != i) { + shared_travs.push_back(a); + } + } + } else { + // if we're not in the genotype, then we ignore support of everything in the genotype + shared_travs = genotype; + } + // we recompute all supports for each allele to get its support relative to the genotype + // there is certainly room for optimization via remembering some of this stuff here + vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); + variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); + } + + // get the genotype likelihoods + // as above, there's some overlap between these computations and those used in genotype() to begin with + // this is an issue with the class interface which probably tries too hard to avoid being VCF-dependent + // but if it causes a slowdown (hasn't seemed to be a factor so far), the code could be re-organized + // to either store some of this information, or compute the genotype and vcf fields in a single shot + variant.format.push_back("GL"); + + // expected depth from our coverage + pair ref_range = make_pair(variant.position, variant.position + variant.ref.length()); + const pair& start_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first); + const pair& end_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.second); + double exp_depth = (start_depth.first + end_depth.first) / 2.; + double depth_err = (start_depth.second + end_depth.second) / 2.; + assert(!isnan(exp_depth) && !isnan(depth_err)); + + // assume ploidy 2 + for (int i = 0; i < traversals.size(); ++i) { + for (int j = i; j < traversals.size(); ++j) { + vector genotype_supports; + if (i == j) { + // put the full support of the allele in for each copy of a homozygous genotype (genotype method expects this) + vector gt_supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false); + genotype_supports = {gt_supports[i], gt_supports[i]}; + } else { + // compute each support relative to the other + // todo: we can speed this up by saving above, or filtering down traversal list to just our genotype alleles + vector gt_supports = support_finder.get_traversal_set_support(traversals, {j}, false, false, false); + genotype_supports.push_back(gt_supports[i]); + gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); + genotype_supports.push_back(gt_supports[j]); + } + double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); + // convert from natural log to log10 by dividing by ln(10) + gl /= 2.30258; + variant.samples[sample_name]["GL"].push_back(std::to_string(gl)); + } + } + + // todo + /* + // Now do the filters + variant.filter = "PASS"; + if (min_site_support < min_mad_for_filter) { + // Apply Min Allele Depth cutoff across all alleles (even ref) + variant.filter = "lowad"; + } else if (min_ad_log_likelihood_for_filter != 0 && + ad_log_likelihood < min_ad_log_likelihood_for_filter) { + // We have a het, but the assignment of reads between the two branches is just too weird + variant.filter = "lowxadl"; + } else if ((int64_t)round(total(total_support)) < min_site_depth) { + // we don't have enough support to want to make a call + variant.filter = "lowdepth"; + } + */ +} + +void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { + + +} + +vector PoissonSupportSnarlCaller::rank_by_support(const vector&
supports) { + vector ranks(supports.size()); + for (int i = 0; i < supports.size(); ++i) { + ranks[i] = i; + } + std::sort(ranks.begin(), ranks.end(), [&](int a, int b) { + return support_val(supports[a]) > support_val(supports[b]); + }); + return ranks; +} } diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 40b4bab6784..1c3deace343 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -12,6 +12,7 @@ #include "snarls.hpp" #include "genotypekit.hpp" #include "traversal_support.hpp" +#include "algorithms/coverage_depth.hpp" namespace vg { @@ -27,11 +28,18 @@ class SnarlCaller { virtual ~SnarlCaller(); /// Get the genotype of a site + /// snarl : site + /// traversals : all traversals to consider + /// ref_trav_idx : index of reference path traversal in traversals (in case it needs special treatment) + /// ref_path : the reference path associated with the snarl + /// ref_range : the interval along the reference path (forward coordinates) spanned by snarl virtual vector genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, - int ploidy) = 0; - + int ploidy, + const string& ref_path_name, + pair ref_range) = 0; + /// Update INFO and FORMAT fields of the called variant virtual void update_vcf_info(const Snarl& snarl, const vector& traversals, @@ -47,23 +55,81 @@ class SnarlCaller { }; /** - * Find the genotype of some traversals in a site using read support + * Interface for a caller that relies on a TraversalSupportFinder + * and has a few very basic support-based cutoffs + * Not very exciting, but currently required for the LegacySupportCaller + * which needs this to interface with the RepresentativeTraversalFinder */ class SupportBasedSnarlCaller : public SnarlCaller { public: SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder); + virtual ~SupportBasedSnarlCaller(); + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant); + /// Set some of the parameters - void set_het_bias(double het_bias, double ref_het_bias = 0.); void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); + + /// Get the traversal support finder + const TraversalSupportFinder& get_support_finder() const; + + /// Get the minimum total support for call + virtual int get_min_total_support_for_call() const; + +protected: + + /// Get the best support out of a list of supports, ignoring skips + static int get_best_support(const vector& supports, const vector& skips); + + /// Relic from old code + static double support_val(const Support& support) { return total(support); }; + + const PathHandleGraph& graph; + + SnarlManager& snarl_manager; + + /// Get support from traversals + TraversalSupportFinder& support_finder; + + /// What's the minimum integer number of reads that must support a call? We + /// don't necessarily want to call a SNP as het because we have a single + // supporting read, even if there are only 10 reads on the site. + int min_total_support_for_call = 1; + /// what's the minimum ref or alt allele depth to give a PASS in the filter + /// column?
Also used as a min actual support for a second-best allele call + size_t min_mad_for_filter = 1; + /// what's the minimum total support (over all alleles) of the site to make + /// a call + size_t min_site_depth = 3; +}; + + +/** + * Find the genotype of some traversals in a site using read support and + * a bias ratio to tell heterozygous from homozygous + */ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { public: RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder); + virtual ~RatioSupportSnarlCaller(); + + /// Set some of the parameters + void set_het_bias(double het_bias, double ref_het_bias = 0.); /// Get the genotype of a site virtual vector genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, - int ploidy); + int ploidy, + const string& ref_path_name, + pair ref_range); /// Update INFO and FORMAT fields of the called variant virtual void update_vcf_info(const Snarl& snarl, @@ -78,20 +144,8 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Use min_alt_path_support threshold as cutoff virtual function get_skip_allele_fn() const; - /// Get the minimum total support for call - virtual int get_min_total_support_for_call() const; - - /// Get the traversal support finder - const TraversalSupportFinder& get_support_finder() const; - protected: - /// Get the best support out of a list of supports, ignoring skips - static int get_best_support(const vector& supports, const vector& skips); - - /// Relic from old code - static double support_val(const Support& support) { return total(support); }; - /// Get the bias used for comparing two traversals /// (It differs heuristically depending on whether they are alt/ref/het/hom/snp/indel /// see tuning parameters below) @@ -104,10 +158,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Tuning - /// What's the minimum integer number of reads that must support a call? We - /// don't necessarily want to call a SNP as het because we have a single - // supporting read, even if there are only 10 reads on the site. - int min_total_support_for_call = 1; /// What fraction of the reads supporting an alt are we willing to discount? /// At 2, if twice the reads support one allele as the other, we'll call /// homozygous instead of heterozygous. At infinity, every call will be @@ -120,26 +170,71 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Used for calling 1/2 calls. If both alts (times this bias) are greater than /// the reference, the call is made. set to 0 to deactivate. double max_ma_bias = 0; - /// what's the minimum ref or alt allele depth to give a PASS in the filter - /// column? Also used as a min actual support for a second-best allele call - size_t min_mad_for_filter = 1; - /// what's the minimum total support (over all alleles) of the site to make - /// a call - size_t min_site_depth = 3; /// what's the min log likelihood for allele depth assignments to PASS? double min_ad_log_likelihood_for_filter = -9; /// used only for pruning alleles in the VCFTraversalFinder: minimum support /// of an allele's alt-path for it to be considered in the brute-force enumeration double min_alt_path_support = 0.2; + +}; - const PathHandleGraph& graph; +/** + * Find the genotype of some traversals in a site using read support + * and a Poisson model based on expected depth.
Inspired, in part, + * by Paragraph, which uses a similar approach for genotyping break points + * + **/ +class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { +public: + PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder, + const algorithms::BinnedDepthIndex& depth_index); + virtual ~PoissonSupportSnarlCaller(); + + /// Get the genotype of a site + virtual vector genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range); + + /// Update INFO and FORMAT fields of the called variant + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant); - SnarlManager& snarl_manager; + /// Define any header fields needed by the above + virtual void update_vcf_header(string& header) const; - TraversalSupportFinder& support_finder; +protected: + + /// Compute likelihood of genotype as product of poisson probabilities + /// P[allele1] * P[allele2] * P[uncalled alleles] + /// Homozygous alleles are split into two, with half support each + /// The (natural) logarithm is returned + double genotype_likelihood(const vector& genotype, + const vector& genotype_supports, + const vector& traversals, + int ref_trav_idx, double exp_depth, double depth_err); + + /// Rank supports + vector rank_by_support(const vector& supports); + + /// Baseline mapping error rate (gets added to the standard error from coverage) + double baseline_mapping_error = 0.05; + + /// Consider up to the top-k traversals (based on support) for genotyping + size_t top_k = 25; + /// Map path name to binned depth coverage from the packer + const algorithms::BinnedDepthIndex& depth_index; + }; + // debug helpers inline string to_string(const HandleGraph& graph, handle_t handle) { return std::to_string(graph.get_id(handle)) + ":" + std::to_string(graph.get_is_reverse(handle)); diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index c2ebac17dda..fbe049486a4 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -28,8 +28,9 @@ void help_call(char** argv) { << endl << "support calling options:" << endl << " -k, --pack FILE Supports created from vg pack for given input graph" << endl - << " -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6]" << endl << " -m, --min-support M,N Minimum allele support (M) and minimum site support (N) for call [default = 1,4]" << endl + << " -B, --bias-mode Use old ratio-based genotyping algorithm as opposed to probabilistic model" << endl + << " -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6]" << endl << "general options:" << endl << " -v, --vcf FILE VCF file to genotype (must have been used to construct input graph with -a)" << endl << " -f, --ref-fasta FILE Reference fasta (required if VCF contains symbolic deletions or inversions)" << endl @@ -55,6 +56,7 @@ int main_call(int argc, char** argv) { vector ref_path_lengths; string min_support_string; string bias_string; + bool ratio_caller = false; int c; optind = 2; // force optind past command positional argument @@ -62,6 +64,7 @@ int main_call(int argc, char** argv) { static const struct option long_options[] = { {"pack", required_argument, 0, 'k'}, + {"bias-mode", no_argument, 0, 'B'}, {"het-bias",
required_argument, 0, 'b'}, {"min-support", required_argument, 0, 'm'}, {"vcf", required_argument, 0, 'v'}, @@ -79,7 +82,7 @@ int main_call(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "k:b:m:v:f:i:s:r:p:o:l:t:h", + c = getopt_long (argc, argv, "k:Bb:m:v:f:i:s:r:p:o:l:t:h", long_options, &option_index); // Detect the end of the options. @@ -91,6 +94,9 @@ int main_call(int argc, char** argv) { case 'k': pack_filename = optarg; break; + case 'B': + ratio_caller = true; + break; case 'b': bias_string = optarg; break; @@ -218,6 +224,11 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: when using -l, the same number of paths must be given with -p" << endl; return 1; } + // Check bias option + if (!bias_string.empty() && !ratio_caller) { + cerr << "error [vg call]: -b can only be used with -B" << endl; + return 1; + } // No paths specified: use them all if (ref_paths.empty()) { @@ -245,6 +256,7 @@ int main_call(int argc, char** argv) { unique_ptr graph_caller; unique_ptr snarl_caller; + algorithms::BinnedDepthIndex depth_index; // Make a Packed Support Caller unique_ptr packer; @@ -256,14 +268,27 @@ int main_call(int argc, char** argv) { // Make a packed traversal support finder PackedTraversalSupportFinder* packed_support_finder = new PackedTraversalSupportFinder(*packer, *snarl_manager); support_finder = unique_ptr(packed_support_finder); - // Make a support caller - SupportBasedSnarlCaller* packed_caller = new SupportBasedSnarlCaller(*graph, *snarl_manager, *packed_support_finder); - if (het_bias >= 0) { - packed_caller->set_het_bias(het_bias, ref_het_bias); + + SupportBasedSnarlCaller* packed_caller = nullptr; + + if (ratio_caller == false) { + // Make a depth index + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 1000000, 0, true, true); + // Make a new-style probabilistic caller + auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index); + packed_caller = poisson_caller; + } else { + // Make an old-style ratio support caller + auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder); + if (het_bias >= 0) { + ratio_caller->set_het_bias(het_bias, ref_het_bias); + } + packed_caller = ratio_caller; } if (min_allele_support >= 0) { packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support); } + snarl_caller = unique_ptr(packed_caller); } diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index 9372a995a73..e3fed43860a 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -60,15 +60,17 @@ tuple TraversalSupportFinder::get_child_support(const Sna Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const { - return get_traversal_set_support({traversal}, {}, false, false).at(0); + return get_traversal_set_support({traversal}, {}, false, false, false).at(0); } vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, - const vector& shared_travs, - bool exclusive_only, - bool exclusive_count, - int ref_trav_idx) const { - + const vector& shared_travs, + bool exclusive_only, + bool exclusive_count, + bool unique, + int ref_trav_idx) const { + assert(!unique || (exclusive_count || exclusive_only)); + // pass 1: how many times have we seen a node or edge unordered_map node_counts; unordered_map edge_counts; @@ -169,12 +171,16 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector 0 && visit_idx <
trav.visit_size() - 1)) { @@ -188,7 +194,9 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector #include @@ -52,10 +52,13 @@ class TraversalSupportFinder { /// exclusive_count is like exclusive only except shared traversals will be counted (as 0) /// when doing average and min support /// if the ref_trav_idx is given, it will be used for computing (deletion) edge lengths + /// if unique is true, then every node or edge will only be counted once + /// (useful for total support) virtual vector get_traversal_set_support(const vector& traversals, const vector& shared_travs, bool exclusive_only, bool exclusive_count, + bool unique, int ref_trav_idx = -1) const; /// Get the total length of all nodes in the traversal From e7c5bca5b0e661e46f87189399bd166e60968dc3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 16:24:01 -0500 Subject: [PATCH 31/79] bug fixes. more aggressive exclusive support filtering --- src/algorithms/coverage_depth.cpp | 5 ++++- src/graph_caller.cpp | 2 +- src/snarl_caller.cpp | 17 ++++++++++++++--- src/subcommand/call_main.cpp | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index 0d63f656cb9..bd46c1aabc5 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -157,7 +157,10 @@ unordered_map>> binned_packed_depth_ind const pair& get_depth_from_index(const unordered_map>>& depth_index, const string& path_name, size_t offset) { - return depth_index.at(path_name).lower_bound(offset)->second; + + auto ub = depth_index.at(path_name).upper_bound(offset); + --ub; + return ub->second; } // draw (roughly) max_nodes nodes from the graph using the random seed diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 55e8d097759..22622e475e0 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -241,7 +241,7 @@ tuple VCFGenotyper::get_ref_positions(const vector> path_offsets; for (const vcflib::Variant* var : variants) { if (path_offsets.count(var->sequenceName)) { - pair& record = path_offsets[var->ref]; + pair& record = path_offsets[var->sequenceName]; record.first = std::min((size_t)var->position, record.first); record.second = std::max((size_t)var->position + var->ref.length(), record.second); } else { diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 77393dce867..5dfba9dee81 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -491,11 +491,18 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, // the candidate genotypes and their supports. 
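// The get_depth_from_index() fix in this patch swaps lower_bound for
// upper_bound-then-decrement: with bins keyed by their start offset, the covering bin is
// the last key less than or equal to the query, not the first key at or after it. A
// standalone sketch over a plain std::map (depth_at is a hypothetical name; it assumes
// the first bin starts at offset 0, so the iterator never steps before begin()):
#include <cstddef>
#include <map>
#include <utility>

static std::pair<double, double> depth_at(const std::map<size_t, std::pair<double, double>>& bins,
                                          size_t offset) {
    auto it = bins.upper_bound(offset);  // first bin starting strictly after offset
    --it;                                // step back to the bin containing offset
    return it->second;                   // (mean coverage, variance or std error)
}
// lower_bound(offset) would instead land on the bin *at or after* offset, reading the
// wrong bin for any query that is not exactly on a bin boundary.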
the numbers here are alleles as indexed in traversals[] map, vector> candidates; + // pre-filter out some alleles based on poor exclusive support + set skips; + // consider each of the top 25 traversals as our top_traversal for (int i = 0; i < max_trav; ++i) { int best_allele = ranked_traversals[i]; + if (skips.count(best_allele)) { + continue; + } + if (ploidy == 1) { candidates[{best_allele}] = {supports[best_allele]}; } else { @@ -504,7 +511,6 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); - set skips = {best_allele}; for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { if (j != best_allele && support_val(secondary_exclusive_supports[j]) <= min_total_support_for_call) { skips.insert(j); @@ -562,6 +568,9 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } } +#ifdef debug + cerr << " best genotype: "; for (auto a : best_genotype) {cerr << a <<",";} cerr << " gl=" << best_genotype_likelihood << endl; +#endif return best_genotype; } @@ -746,8 +755,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { - - + header += "##INFO=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; } vector PoissonSupportSnarlCaller::rank_by_support(const vector& supports) { diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index fbe049486a4..3f69185b0c9 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -273,7 +273,7 @@ int main_call(int argc, char** argv) { if (ratio_caller == false) { // Make a depth index - depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 1000000, 0, true, true); + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true); // Make a new-style probabilistic caller auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index); packed_caller = poisson_caller; From faa15ecaceeccecdee261698936a093d580cb7d7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 16:57:50 -0500 Subject: [PATCH 32/79] another exclusive support bug --- src/snarl_caller.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 5dfba9dee81..031e379dee1 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -512,7 +512,9 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, // doesn't meet a certain cutoff vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { - if (j != best_allele && support_val(secondary_exclusive_supports[j]) <= min_total_support_for_call) { + if (j != best_allele && + support_val(secondary_exclusive_supports[j]) < min_total_support_for_call && + support_val(secondary_exclusive_supports[j]) < support_val(supports[j])) { skips.insert(j); } } From d2705eb3a15a3a47df86b598c546d914c310de1c Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Fri, 8 Nov 2019 14:38:13 -0800 Subject: [PATCH 33/79] Passed random unit tests --- src/seed_clusterer.cpp | 52
+++++++++++++++------------------ src/seed_clusterer.hpp | 23 +++++++-------- src/unittest/seed_clusterer.cpp | 3 +- 3 files changed, 35 insertions(+), 43 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index 31fe6e6b7b7..f0119d2cd34 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -2,7 +2,7 @@ #include -#define DEBUG_CLUSTER +//#define DEBUG_CLUSTER namespace vg { @@ -12,14 +12,14 @@ namespace vg { SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (vector seeds, int64_t read_distance_limit) const { vector> all_seeds; - all_seeds.push_back(std::move(seeds)); - tuple>>,vector>> all_clusters = + all_seeds.push_back(seeds); + tuple,SnarlSeedClusterer::cluster_group_t> all_clusters = cluster_seeds(all_seeds, read_distance_limit, 0); return std::get<0>(all_clusters)[0]; }; tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( - vector> all_seeds, int64_t read_distance_limit, + vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit) const { /* Given a vector of seeds and a limit, find a clustering of seeds where * seeds that are closer than the limit cluster together. @@ -122,7 +122,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cerr << endl; } #endif - vector>> read_clusters; + vector read_clusters; for (auto& uf : tree_state.read_union_find) { read_clusters.emplace_back(uf.all_groups()); } @@ -320,7 +320,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { size_t group_id = seed_range_start->second; - if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { //For each seed on this node, add it to the cluster @@ -344,6 +343,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { tree_state.read_union_find[read_num].union_groups(group_id, iter->second); if (tree_state.fragment_distance_limit != 0 ) { + if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; tree_state.fragment_union_find.union_groups(fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); } @@ -444,8 +444,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { size_t read_num = std::get<0>(s); if (read_last_offset[read_num] != -1 && - abs(std::get<2>(s) - read_last_offset[read_num]) <= tree_state.read_distance_limit) { - //TODO: Need abs? 
+ std::get<2>(s) - read_last_offset[read_num] <= tree_state.read_distance_limit) { //If this seed is in the same read cluster as the previous one, //union them @@ -474,7 +473,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { make_pair(read_last_offset[read_num], node_length - read_last_offset[read_num] + 1); if (tree_state.fragment_distance_limit != 0) { if (fragment_last_offset != -1 && - abs(std::get<2>(s) - fragment_last_offset) <= tree_state.fragment_distance_limit) { + std::get<2>(s) - fragment_last_offset <= tree_state.fragment_distance_limit) { //If this is a new read cluster but the same fragment cluster tree_state.fragment_union_find.union_groups(std::get<1>(s)+tree_state.read_index_offsets[read_num], fragment_last_cluster); fragment_last_cluster = tree_state.fragment_union_find.find_group(fragment_last_cluster); @@ -619,7 +618,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { fragment_combined_group = tree_state.fragment_union_find.find_group( new_group + tree_state.read_index_offsets[read_num]); } - cerr << endl; return; }; //The clusters of the chain that are built from the snarl clusters @@ -1331,7 +1329,6 @@ cerr << " Combining this cluster from the right" << endl; pair dists_c = tree_state.read_cluster_dists[child_cluster_head.first][child_cluster_head.second]; old_dists[child_cluster_head] = dists_c; - //TODO: Do this only once pair new_dists = snarl_index.distToEnds(node_rank, dists_c.first,dists_c.second); #ifdef DEBUG_CLUSTER @@ -1431,35 +1428,35 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank //from the left of both of them int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : dist_l_l + dists_c.first + other_node_clusters.read_best_left[read_num]-1; + int64_t fragment_dist = dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1; combine_clusters(c_group, group_l_l[read_num], fragment_group_l_l, - dist_l_l + dists_c.first + other_node_clusters.fragment_best_left-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_l_r != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j + int64_t fragment_dist = dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? -1 : dist_l_r + dists_c.first + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, - dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_l != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_left != -1 ) { + int64_t fragment_dist = dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1; int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, - dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_r != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_right != -1 ) { + int64_t fragment_dist = dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? 
-1 : dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1; combine_clusters(c_group, group_r_r[read_num], fragment_group_r_r, - dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } } @@ -1484,39 +1481,38 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.first != -1 ){ + int64_t fragment_dist = dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : dist_l_l + curr_child_clusters.read_best_left[read_num] + dists_k.first-1; combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, - dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1, - read_dist, read_num); + fragment_dist,read_dist, read_num); } if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.second != -1 ) { int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : dist_l_r + curr_child_clusters.read_best_left[read_num] + dists_k.second-1; - + int64_t fragment_dist = dist_l_r + curr_child_clusters.fragment_best_left + dists_k.second-1; combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, - dist_l_r + curr_child_clusters.fragment_best_left + dists_k.second-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.first != -1 ) { + int64_t fragment_dist = dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : dist_r_l + curr_child_clusters.read_best_right[read_num] + dists_k.first-1; combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, - dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.second != -1 ) { + int64_t fragment_dist = dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : dist_r_r + curr_child_clusters.read_best_right[read_num] + dists_k.second-1; combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, - dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1, - read_dist, read_num); + fragment_dist, read_dist, read_num); } } } diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index f95f0261ae9..92c6b604f46 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -14,27 +14,26 @@ class SnarlSeedClusterer { SnarlSeedClusterer(MinimumDistanceIndex& dist_index); + //Represents all clusters for one vector of seeds + //Each cluster is a vector of indexes into the vector of seeds typedef vector> cluster_group_t; ///Given a vector of seeds (pos_t) and a distance limit, //cluster the seeds such that two seeds whose minimum distance //between them (including both of the positions) is less than // the distance limit are in the same cluster - // - //Returns a vector of clusters. Each cluster is a vector of - //indices into seeds cluster_group_t cluster_seeds ( vector seeds, int64_t read_distance_limit) const; ///The same thing, but for paired end reads. 
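// A self-contained sketch (editor's illustration, not vg code) of the
// fragment-index convention used by the paired-end interface below:
// read_index_offsets, the prefix-sum field documented further down, maps a
// (read number, seed index) pair to the index the seed would have if the
// per-read seed vectors were appended to each other.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> seeds_per_read = {4, 3, 5};    // hypothetical per-read seed counts
    std::vector<size_t> read_index_offsets(1, 0);
    for (size_t count : seeds_per_read) {
        read_index_offsets.push_back(read_index_offsets.back() + count);
    }
    // read_index_offsets is now {0, 4, 7, 12}
    size_t read_num = 1, seed_index = 2;
    std::printf("%zu\n", read_index_offsets[read_num] + seed_index);  // prints 6
    return 0;
}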
- //Given seeds from multiple reads of a fragment, cluster each set of seeds - //by the read distance and all seeds by the fragment distance limit + //Given seeds from multiple reads of a fragment, cluster each read + //by the read distance and all seeds by the fragment distance limit. //fragment_distance_limit must be greater than read_distance_limit //Returns clusters for each read and clusters of all the seeds in all reads //The read clusters refer to seeds by their indexes in the input vectors of seeds //The fragment clusters give seeds the index they would get if the vectors of // seeds were appended to each other in the order given tuple, cluster_group_t> cluster_seeds ( - vector> all_seeds, + vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit=0) const; private: @@ -137,7 +136,8 @@ class SnarlSeedClusterer { //Vector of all the seeds for each read vector>* all_seeds; - //Vector of the offset of indices for each seed + //prefix sum vector of the number of seeds per read + //To get the index of a seed for the fragment clusters vector read_index_offsets; //The minimum distance between nodes for them to be put in the @@ -191,7 +191,8 @@ class SnarlSeedClusterer { hash_map>> parent_snarl_to_nodes; - //Constructor takes in a pointer to the seeds and the distance limit + //Constructor takes in a pointer to the seeds, the distance limits, and + //the total number of seeds in all_seeds TreeState (vector>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), @@ -200,9 +201,7 @@ class SnarlSeedClusterer { fragment_union_find (seed_count, false), read_index_offsets(1,0){ - size_t total_seeds = 0; for (vector& v : *all_seeds) { - total_seeds += v.size(); size_t offset = read_index_offsets.back() + v.size(); read_index_offsets.push_back(offset); read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1)); @@ -231,8 +230,7 @@ class SnarlSeedClusterer { //Cluster all the chains at the current level void cluster_chains(TreeState& tree_state, size_t depth) const; - //Given a node and the indices of seeds on that node, root, - //cluster the seeds + //Cluster the seeds on the specified node NodeClusters cluster_one_node(TreeState& tree_state, id_t node_id, int64_t node_length) const; @@ -243,7 +241,6 @@ class SnarlSeedClusterer { //Cluster the seeds in a snarl given by snarl_index_i, an index into //dist_index.snarl_indexes - //rev is true if this snarl is reversed in its parent NodeClusters cluster_one_snarl(TreeState& tree_state, size_t snarl_index_i) const; diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 8e107e8e99d..e6fec99233c 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -845,7 +845,7 @@ namespace unittest { uniform_int_distribution randSnarlIndex(0, allSnarls.size()-1); default_random_engine generator(time(NULL)); - for (size_t k = 0; k < 100 ; k++) { + for (size_t k = 0; k < 1000 ; k++) { vector> all_seeds; all_seeds.emplace_back(); all_seeds.emplace_back(); @@ -892,7 +892,6 @@ namespace unittest { for (size_t a = 0; a < one_read_clusters.size(); a++) { // For each cluster -cluster this cluster to ensure that // there is only one - cerr << a << " of " << one_read_clusters.size() << endl; vector clust = one_read_clusters[a]; structures::UnionFind new_clusters (clust.size(), false); From 820ad0fd174e56c648173a1cc70823e4543feb54 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 21:54:28 -0500 Subject: [PATCH 34/79] 
fix bugs to get bash tap tests going --- src/algorithms/coverage_depth.cpp | 2 +- src/snarl_caller.cpp | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index bd46c1aabc5..6d28335f4d7 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -146,7 +146,7 @@ unordered_map>> binned_packed_depth_ind double var = get<3>(binned_depth); // optionally convert variance to standard error if (std_err) { - var = sqrt(var / (double)(get<1>(binned_depth) - get<2>(binned_depth))); + var = sqrt(var / (double)(get<1>(binned_depth) - get<0>(binned_depth))); } depth_map[get<0>(binned_depth)] = make_pair(get<2>(binned_depth), var); } diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 031e379dee1..5ca35c7c2a0 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -610,7 +610,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } // how many reads would we expect to not map to our genotype due to error - double error_rate = std::min(0.95, depth_err + baseline_mapping_error); + double error_rate = std::min(0.25, depth_err + baseline_mapping_error); double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support); // and our likelihood for the unmapped reads we see: @@ -733,8 +733,13 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) - gl /= 2.30258; - variant.samples[sample_name]["GL"].push_back(std::to_string(gl)); + variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); + + // use our likelihood as the VCF quality + // todo: check if there's something more conventional to use + if ((genotype[0] == i && genotype[1] == j) || (genotype[0] == j && genotype[1] == i)) { + variant.quality = logprob_to_phred(gl); + } } } From 1eeb6af52bf2199f1925cc47a5cdbd8339c74c2c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Nov 2019 21:58:46 -0500 Subject: [PATCH 35/79] turn off debug output --- src/snarl_caller.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 5ca35c7c2a0..d22171ba617 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -1,7 +1,7 @@ #include "snarl_caller.hpp" #include "genotypekit.hpp" -#define debug +//#define debug namespace vg { From d7fb34e700ab63478b3a6581dfbfbef79340968c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 11 Nov 2019 16:14:42 -0500 Subject: [PATCH 36/79] add function for ewens sampling probability --- src/distributions.hpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/distributions.hpp b/src/distributions.hpp index fb5a4b3cdb4..6543cacc0db 100644 --- a/src/distributions.hpp +++ b/src/distributions.hpp @@ -785,6 +785,38 @@ class discrete_distribution { }; +// ewen's allele sampling distribution. for use in genotype prior (as in freebayes) +// gives Pr(a1, ...,an;theta) where ai is the number of sampled haplotypes (out of n) that +// have i different alleles at a given locus. theta is the population mutation rate. +// ex: for a single diploid genotype, a={2,0} = heterozygous: 2 alleles occur once. +// a={0,1} = homozygous: 1 allele occurs twice. 
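// A standalone numeric check (editor's sketch, not part of the patch) of the
// two diploid cases above: for n = 2 haplotypes, Ewens's formula collapses to
// Pr(het) = theta / (1 + theta) and Pr(hom) = 1 / (1 + theta), which sum to 1,
// so a small theta makes heterozygous configurations rare a priori.
#include <cstdio>

int main() {
    double theta = 0.001;                  // assumed population mutation rate
    double p_het = theta / (1. + theta);   // a = {2,0}: 2!/(theta(theta+1)) * theta^2/(1^2 * 2!)
    double p_hom = 1. / (1. + theta);      // a = {0,1}: 2!/(theta(theta+1)) * theta^1/(2^1 * 1!)
    std::printf("het %g hom %g sum %g\n", p_het, p_hom, p_het + p_hom);
    return 0;
}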
+// +// https://en.wikipedia.org/wiki/Ewens%27s_sampling_formula +// https://github.com/ekg/freebayes/blob/master/src/Ewens.cpp#L17 +inline real_t ewens_af_prob_ln(const vector& a, real_t theta) { + + // first term (wrt formula as stated on wikipedia) + // n! / (theta * (theta + 1) * ... (theta + n - 1)) + real_t term1_num_ln = factorial_ln(a.size()); + real_t term1_denom_ln = 0.; + for (int i = 0; i < a.size(); ++i) { + term1_denom_ln += log(theta + i); + } + real_t term1_ln = term1_num_ln - term1_denom_ln; + + // second term + // prod [ (theta^aj) / (j^aj * aj!) ] + real_t term2_ln = 0.; + for (int j = 0; j < a.size(); ++j) { + // ln(theta^aj) = aj * ln(theta) + real_t num = a[j] * log(theta); + // ln(j^aj * aj!) = aj * ln(j) + ln(aj!); the two log terms add, + // they must not be summed inside a single log() + real_t denom = a[j] * log(1. + j) + factorial_ln(a[j]); + term2_ln += num - denom; + } + + return term1_ln + term2_ln; +} + + } #endif From fd5b96731d4502817366baef5e3c5555c1063899 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 11 Nov 2019 16:17:20 -0500 Subject: [PATCH 37/79] let poisson caller use min-ad and het-bias from old caller for now (hopefully temporary) --- src/snarl_caller.cpp | 131 +++++++++++++++++++++-------------- src/snarl_caller.hpp | 38 +++++----- src/subcommand/call_main.cpp | 13 ++-- 3 files changed, 103 insertions(+), 79 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index d22171ba617..475b6454666 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -54,6 +54,19 @@ void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double m } } +void SupportBasedSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { int best_allele = -1; for(size_t i = 0; i < supports.size(); i++) { @@ -65,6 +78,38 @@ int SupportBasedSnarlCaller::get_best_support(const vector& supports, c return best_allele; } +double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const { + bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || + (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); + + double bias_limit = 1; + + if (best_trav >= 0 && second_best_trav >=0) { + if (best_trav == ref_trav_idx) { + // Use ref bias limit + + // We decide closeness differently depending on whether best is ref + // or not. In practice, we use this to slightly penalize homozygous + // ref calls (by setting max_ref_het_bias higher than max_het_bias) + // and rather make a less supported alt call instead. This boost + // max sensitivity, and because everything is homozygous ref by + // default in VCF, any downstream filters will effectively reset + // these calls back to homozygous ref. TODO: This shouldn't apply + // when off the primary path! 
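// A small numeric sketch (editor's note, not part of the diff): later in this
// patch, genotype_likelihood() reuses the bias returned here as a pseudo-prior,
// computing log(1. - 1. / het_bias) for a het and log(1. / het_bias) for a hom.
#include <cmath>
#include <cstdio>

int main() {
    double het_bias = 6.;                      // the default max_het_bias
    std::printf("het prior %f hom prior %f\n",
                std::log(1. - 1. / het_bias),  // log(5/6) ~ -0.182
                std::log(1. / het_bias));      // log(1/6) ~ -1.792
    return 0;
}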
+ bias_limit = max_ref_het_bias; + } else if (is_indel) { + // This is an indel + // Use indel bias limit + bias_limit = max_indel_het_bias; + } else { + // Use normal het bias limit + bias_limit = max_het_bias; + } + } + return bias_limit; +} + RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder) : SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { @@ -74,19 +119,6 @@ RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { } -void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { - // want to move away from ugly hacks that treat the reference traversal differently, - // so keep all these set the same - if (het_bias >= 0) { - max_het_bias = het_bias; - max_ref_het_bias = het_bias; - max_indel_het_bias = het_bias; - } - if (ref_het_bias >= 0) { - max_ref_het_bias = ref_het_bias; - } -} - vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, @@ -418,39 +450,6 @@ function RatioSupportSnarlCaller::get_skip_allele_f }; } -double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, - int second_best_trav, int ref_trav_idx) const { - bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || - (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); - - double bias_limit = 1; - - if (best_trav >= 0 && second_best_trav >=0) { - if (best_trav == ref_trav_idx) { - // Use ref bias limit - - // We decide closeness differently depending on whether best is ref - // or not. In practice, we use this to slightly penalize homozygous - // ref calls (by setting max_ref_het_bias higher than max_het_bias) - // and rather make a less supported alt call instead. This boost - // max sensitivity, and because everything is homozygous ref by - // default in VCF, any downstream filters will effectively reset - // these calls back to homozygous ref. TODO: This shouldn't apply - // when off the primary path! 
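// The genotype_likelihood() changes further down score observed read depths
// against Poisson expectations. A minimal standalone sketch (editor's
// illustration, not vg's implementation) of the underlying log-pmf, with
// lgamma supplying ln(d!):
#include <cmath>
#include <cstdio>

double poisson_prob_ln(double observed, double lambda) {
    // ln P(observed | lambda) = observed * ln(lambda) - lambda - ln(observed!)
    return observed * std::log(lambda) - lambda - std::lgamma(observed + 1.);
}

int main() {
    double exp_depth = 30.;                                // assumed expected depth
    std::printf("%f\n", poisson_prob_ln(28., exp_depth));  // ~ -2.66
    return 0;
}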
- bias_limit = max_ref_het_bias; - } else if (is_indel) { - // This is an indel - // Use indel bias limit - bias_limit = max_indel_het_bias; - } else { - // Use normal het bias limit - bias_limit = max_het_bias; - } - } - return bias_limit; -} - - PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder, const algorithms::BinnedDepthIndex& depth_index) : @@ -563,7 +562,7 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double best_genotype_likelihood = -numeric_limits::max(); vector best_genotype; for (const auto& candidate : candidates) { - double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); + double gl = genotype_likelihood(candidate.first, candidate.second, traversals, traversal_sizes, ref_trav_idx, exp_depth, depth_err); if (gl > best_genotype_likelihood) { best_genotype_likelihood = gl; best_genotype = candidate.first; @@ -579,6 +578,7 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, const vector& genotype_supports, const vector& traversals, + const vector& traversal_sizes, int ref_trav_idx, double exp_depth, double depth_err) { assert(genotype_supports.size() == genotype.size()); @@ -602,6 +602,24 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); } } + + // we preserve our het-bias from RatioSupportSnarlCaller as something prior-like here + // todo: use something better + double het_prior = 0.; + double ew_prior = 0.; + if (genotype.size() == 2) { + double het_bias = get_bias(traversal_sizes, genotype[0], genotype[1], ref_trav_idx); + if (genotype[0] != genotype[1]) { + // if the het_bias is greater than 1 (usually it's 6 by default), then + // we get a prior of 5/6 for a het + het_prior = log(1. - 1. / het_bias); + ew_prior = ewens_af_prob_ln({2, 0}, 0.001); + } else { + // and 1/6 for a hom + het_prior = log(1. 
/ het_bias); + ew_prior = ewens_af_prob_ln({0, 1}, 0.001); + } + } // total support of the site Support total_site_support = total_other_support; @@ -643,11 +661,12 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } #ifdef debug - cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood - << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood << " prior " << het_prior + << " ew prior " << ew_prior + << " total-prob " << (alleles_log_likelihood + other_log_likelihood + het_prior) << endl; #endif - return alleles_log_likelihood + other_log_likelihood; + return alleles_log_likelihood + other_log_likelihood + het_prior; } void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, @@ -668,6 +687,8 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } double total_site_depth = support_val(site_support); + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + // Set the variant's total depth string depth_string = std::to_string((int64_t)round(total_site_depth)); variant.format.push_back("DP"); @@ -679,6 +700,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); set called_allele_set(genotype.begin(), genotype.end()); + double min_site_support = genotype.size() > 0 ? INFINITY : 0; for (int i = 0; i < traversals.size(); ++i) { vector shared_travs; @@ -698,6 +720,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // there is certainly room for optimization via remembering some of this stuff here vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); + if (in_genotype) { + // update the minimum support + min_site_support = min(min_site_support, total(allele_supports[i])); + } } // get the genotype likelihoods @@ -731,7 +757,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); genotype_supports.push_back(gt_supports[j]); } - double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); + double gl = genotype_likelihood({i, j}, genotype_supports, traversals, traversal_sizes, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); @@ -743,6 +769,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } + // use old quality for now + variant.quality = min_site_support; + // todo /* // Now do the filters diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 1c3deace343..e2d7aa78f5c 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -75,6 +75,8 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Set some of the parameters void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); + + void set_het_bias(double het_bias, double ref_het_bias = 0.); /// Get the traversal support finder const TraversalSupportFinder& get_support_finder() const; @@ -90,6 +92,12 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Relic from old code static double 
support_val(const Support& support) { return total(support); }; + /// Get the bias used to for comparing two traversals + /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel + /// see tuning parameters below) + double get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const; + const PathHandleGraph& graph; SnarlManager& snarl_manager; @@ -107,6 +115,15 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// what's the minimum total support (over all alleles) of the site to make /// a call size_t min_site_depth = 3; + /// What fraction of the reads supporting an alt are we willing to discount? + /// At 2, if twice the reads support one allele as the other, we'll call + /// homozygous instead of heterozygous. At infinity, every call will be + /// heterozygous if even one read supports each allele. + double max_het_bias = 6; + /// Like above, but applied to ref / alt ratio (instead of alt / ref) + double max_ref_het_bias = 6; + /// Like the max het bias, but applies to novel indels. + double max_indel_het_bias = 6; }; @@ -120,9 +137,6 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { TraversalSupportFinder& support_finder); virtual ~RatioSupportSnarlCaller(); - /// Set some of the parameters - void set_het_bias(double het_bias, double ref_het_bias = 0.); - /// Get the genotype of a site virtual vector genotype(const Snarl& snarl, const vector& traversals, @@ -146,27 +160,12 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { protected: - /// Get the bias used to for comparing two traversals - /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel - /// see tuning parameters below) - double get_bias(const vector& traversal_sizes, int best_trav, - int second_best_trav, int ref_trav_idx) const; - /// get a map of the beginning of a node (in forward orientation) on a traversal /// used for up-weighting large deletion edges in complex snarls with average support unordered_map get_ref_offsets(const SnarlTraversal& ref_trav) const; /// Tuning - /// What fraction of the reads supporting an alt are we willing to discount? - /// At 2, if twice the reads support one allele as the other, we'll call - /// homozygous instead of heterozygous. At infinity, every call will be - /// heterozygous if even one read supports each allele. - double max_het_bias = 6; - /// Like above, but applied to ref / alt ratio (instead of alt / ref) - double max_ref_het_bias = 6; - /// Like the max het bias, but applies to novel indels. - double max_indel_het_bias = 6; /// Used for calling 1/2 calls. If both alts (times this bias) are greater than /// the reference, the call is made. set to 0 to deactivate. 
double max_ma_bias = 0; @@ -218,13 +217,14 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { double genotype_likelihood(const vector& genotype, const vector& genotype_supports, const vector& traversals, + const vector& traversal_sizes, int ref_trav_idx, double exp_depth, double depth_err); /// Rank supports vector rank_by_support(const vector& supports); /// Baseline mapping error rate (gets added to the standard error from coverage) - double baseline_mapping_error = 0.05; + double baseline_mapping_error = 0.005; /// Consider up to the top-k traversals (based on support) for genotyping size_t top_k = 25; diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 3f69185b0c9..4a611c67303 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -224,11 +224,6 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: when using -l, the same number paths must be given with -p" << endl; return 1; } - // Check bias option - if (!bias_string.empty() && !ratio_caller) { - cerr << "error [vg call]: -b can only be used with -B" << endl; - return 1; - } // No paths specified: use them all if (ref_paths.empty()) { @@ -273,21 +268,21 @@ int main_call(int argc, char** argv) { if (ratio_caller == false) { // Make a depth index - depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true); + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 50000, 0, true, true); // Make a new-stype probablistic caller auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index); packed_caller = poisson_caller; } else { // Make an old-style ratio support caller auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder); - if (het_bias >= 0) { - ratio_caller->set_het_bias(het_bias, ref_het_bias); - } packed_caller = ratio_caller; } if (min_allele_support >= 0) { packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support); } + if (het_bias >= 0) { + packed_caller->set_het_bias(het_bias, ref_het_bias); + } snarl_caller = unique_ptr(packed_caller); } From ed7538a438da0b196482581d60c7e9e84078a1c8 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 11 Nov 2019 14:01:06 -0800 Subject: [PATCH 38/79] Moved combining chain clusters to a helper --- src/seed_clusterer.cpp | 301 +++++++++++++++++++---------------------- src/seed_clusterer.hpp | 4 +- 2 files changed, 144 insertions(+), 161 deletions(-) diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index f0119d2cd34..dd16f19c8c5 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -2,7 +2,7 @@ #include -//#define DEBUG_CLUSTER +#define DEBUG_CLUSTER namespace vg { @@ -84,10 +84,10 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //Cluster all the snarls at this depth //Also records which snarls are in chains and the parents of these //snarls in tree_state.parent_snarl_to_node - cluster_snarls(tree_state, depth); + cluster_snarl_level(tree_state, depth); //And cluster all the chains, record the parents of these chains - cluster_chains(tree_state, depth); + cluster_chain_level(tree_state, depth); // Swap buffer over for the next level tree_state.snarl_to_nodes = move(tree_state.parent_snarl_to_nodes); @@ -138,36 +138,33 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { void SnarlSeedClusterer::get_nodes( TreeState& tree_state, vector>>>& - snarl_to_nodes_by_level) const { + 
snarl_to_nodes_by_level) const { // Assign each seed to a node. + hash_set seen_nodes; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ vector& seeds = tree_state.all_seeds->at(read_num); for (size_t i = 0; i < seeds.size(); i++) { id_t id = get_id(seeds.at(i)); + + //Assign the seed to a node tree_state.node_to_seeds[read_num].emplace_back(id, i); - //For each seed, assign it to a node and the node to a snarl - } - std::sort(tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end()); - } - // Assign each node to a snarl. - hash_set seen_nodes; - for (auto& read_node :tree_state.node_to_seeds) { - for (auto& mapping : read_node) { - if (seen_nodes.count(mapping.first) < 1) { - seen_nodes.insert( mapping.first); - size_t snarl_i = dist_index.getPrimaryAssignment(mapping.first); + //And the node to a snarl + if (seen_nodes.count(id) < 1) { + seen_nodes.insert(id); + size_t snarl_i = dist_index.getPrimaryAssignment(id); size_t depth = dist_index.snarl_indexes[snarl_i].depth; snarl_to_nodes_by_level[depth][snarl_i].emplace_back( - NetgraphNode(mapping.first, NODE), NodeClusters(tree_state.all_seeds->size())); + NetgraphNode(id, NODE), NodeClusters(tree_state.all_seeds->size())); } } + std::sort(tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end()); } } - void SnarlSeedClusterer::cluster_snarls(TreeState& tree_state, size_t depth) const { + void SnarlSeedClusterer::cluster_snarl_level(TreeState& tree_state, size_t depth) const { for (auto& kv : tree_state.snarl_to_nodes){ //Go through each of the snarls at this level, cluster them, @@ -192,14 +189,11 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //If this snarl is in a chain, cluster and add let the //tree state know which chain it belongs to - size_t chain_assignment = dist_index.getChainAssignment( - snarl_index.parent_id); - size_t chain_rank = dist_index.getChainRank( - snarl_index.id_in_parent); + size_t chain_assignment = dist_index.getChainAssignment(snarl_index.parent_id); + size_t chain_rank = dist_index.getChainRank(snarl_index.id_in_parent); tree_state.chain_to_snarls[chain_assignment].emplace( - chain_rank, make_pair(snarl_i, - cluster_one_snarl(tree_state, snarl_i))); + chain_rank, make_pair(snarl_i, cluster_one_snarl(tree_state, snarl_i))); #ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i << " headed by " @@ -243,7 +237,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } } - void SnarlSeedClusterer::cluster_chains(TreeState& tree_state, size_t depth) const { + void SnarlSeedClusterer::cluster_chain_level(TreeState& tree_state, size_t depth) const { for (auto& kv : tree_state.chain_to_snarls) { //For each chain at this level that has relevant child snarls in it, //find the clusters. @@ -274,7 +268,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { #endif // Map it to the snarl number that should be represented by it // (and thus also contain the chain) - size_t parent_snarl_i =dist_index.getPrimaryAssignment(parent_id); + size_t parent_snarl_i = dist_index.getPrimaryAssignment(parent_id); // Register clusters as relevant for that parent snarl. @@ -299,12 +293,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cerr << "Finding clusters on node " << node_id << " which has length " << node_length << endl; #endif - /*Find clusters of seeds in this node. 
- * Result contains hash_set of the union find group IDs of the new clusters, - * and the shortest distance from any seed to the left and right sides - * of the node*/ - //indices of union find group ids of clusters in this node + //Final clusters on the node that we will be returning NodeClusters node_clusters(tree_state.all_seeds->size()); if (tree_state.read_distance_limit > node_length) { @@ -314,14 +304,15 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { size_t fragment_group_id = -1; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), + tree_state.node_to_seeds[read_num].begin(), tree_state.node_to_seeds[read_num].end(), std::pair(node_id, 0)); - if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { + if (seed_range_start != tree_state.node_to_seeds[read_num].end() + && seed_range_start->first == node_id) { size_t group_id = seed_range_start->second; - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() + && iter->first == node_id; ++iter) { //For each seed on this node, add it to the cluster //And find the shortest distance from any seed to both //ends of the node @@ -402,19 +393,22 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { return node_clusters; } + + //The seeds may form multiple clusters on the node + //Sort the seeds by their offset in the node and split into clusters + + // for all seeds vector> seed_offsets; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++) { - auto seed_range_start = std::lower_bound( - tree_state.node_to_seeds[read_num].begin(), - tree_state.node_to_seeds[read_num].end(), - std::pair(node_id, 0)); - if (seed_range_start != tree_state.node_to_seeds[read_num].end()) { - for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { + auto seed_range_start = std::lower_bound( + tree_state.node_to_seeds[read_num].begin(),tree_state.node_to_seeds[read_num].end(), + std::pair(node_id, 0)); + if (seed_range_start != tree_state.node_to_seeds[read_num].end() && seed_range_start->first == node_id) { + for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() + && iter->first == node_id; ++iter) { //For each seed, find its offset pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; - int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) : get_offset(seed) + 1; node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); node_clusters.fragment_best_right = min_not_minus_one(node_length-offset+1, node_clusters.fragment_best_right); @@ -465,6 +459,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } else { //This becomes a new read cluster if (read_last_cluster[read_num] != -1) { + //Record the previous cluster node_clusters.read_cluster_heads.emplace(read_num, read_last_cluster[read_num]); } read_last_cluster[read_num] = std::get<1>(s); @@ -546,6 +541,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { TreeState& tree_state, size_t chain_index_i) const { /* * Find all the clusters in the given chain + * Iterate through snarls and create clusters of positions up to that snarl */ std::map>& snarls_in_chain = @@ -558,6 +554,64 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { << " headed by node " << chain_index.id_in_parent << endl; #endif + auto combine_chain_clusters = [&] (size_t& cluster_group, + vector& combined_group, size_t& fragment_combined_group, + vector& combined_left, vector& combined_right, + pair& dists, + vector>& to_erase, int64_t& fragment_dist,int64_t& read_dist, + size_t& read_num){ + //Compare and combine the given cluster_group with the read and fragment combined cluster + //Update the distances of the read combined cluster, if combined + //Returns true if the cluster got combined with a read cluster + + if (read_dist != -1 && read_dist <= tree_state.read_distance_limit){ + //If this chain cluster's rightmost seed is close enough + //to the leftmost seed of any cluster in the snarl, then + //this chain cluster is in the combined cluster + + if (combined_group[read_num] == -1) { + //New chain cluster + combined_group[read_num] = cluster_group; + combined_left[read_num] = dists.first; + combined_right[read_num] = dists.second; + } else { + //Combine + tree_state.read_union_find[read_num].union_groups(combined_group[read_num], cluster_group); + size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_group); + if (new_group == cluster_group) { + to_erase.emplace_back(read_num,combined_group[read_num]); + } else { + to_erase.emplace_back(read_num, cluster_group); + } + combined_group[read_num] = new_group; + combined_left[read_num] = min_not_minus_one(combined_left[read_num], dists.first); + combined_right[read_num] = min_not_minus_one(combined_right[read_num], dists.second); + } + cerr << "COMBINING READ: " ; + if (tree_state.fragment_distance_limit != 0) { + if (fragment_combined_group != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_group, + cluster_group+tree_state.read_index_offsets[read_num]); + } + fragment_combined_group = tree_state.fragment_union_find.find_group( + cluster_group+tree_state.read_index_offsets[read_num]); + cerr << " AND FRAGMENT" << endl; + } + return true; + } else if (fragment_dist != -1 && + fragment_dist <= tree_state.fragment_distance_limit) { + //If this is a new read cluster but the same fragment cluster + if (fragment_combined_group != -1) { + tree_state.fragment_union_find.union_groups(fragment_combined_group, cluster_group+tree_state.read_index_offsets[read_num]); + } + fragment_combined_group = tree_state.fragment_union_find.find_group(cluster_group+tree_state.read_index_offsets[read_num]); + + return false; + } + return false; + }; + + auto combine_snarl_clusters = [&] (size_t& new_group, size_t& combined_group, size_t& 
fragment_combined_group, vector>& to_erase, int64_t fragment_dist,int64_t read_dist, @@ -571,7 +625,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { combined_group = new_group; } else { //Union the two groups - combined_group = tree_state.read_union_find[read_num].find_group(combined_group); tree_state.read_union_find[read_num].union_groups(combined_group, new_group); //Find the new distances of the combined groups pair& old_dists = @@ -589,7 +642,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { dists = make_pair( min_not_minus_one(old_dists.first, dists.first), min_not_minus_one(old_dists.second, dists.second)); - tree_state.read_cluster_dists[read_num][new_group] = dists; tree_state.read_cluster_dists[read_num][combined_group] = dists; #ifdef DEBUG_CLUSTER cerr << " New dists for read num " << read_num << ": " @@ -598,16 +650,16 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { #endif } - if (tree_state.fragment_distance_limit != 0) { + if (tree_state.fragment_distance_limit != 0 && fragment_dist != -1) { if (fragment_combined_group != -1) { - //If we're keeping track of fragment clusters, union this + //If we're also keeping track of fragment clusters tree_state.fragment_union_find.union_groups(fragment_combined_group, new_group + tree_state.read_index_offsets[read_num]); } fragment_combined_group = tree_state.fragment_union_find.find_group( new_group + tree_state.read_index_offsets[read_num]); } - } else if (tree_state.fragment_distance_limit != 0 && + } else if (tree_state.fragment_distance_limit != 0 && fragment_dist != -1 && fragment_dist <= tree_state.fragment_distance_limit) { //If these aren't in the same read cluster but are in //the same fragment cluster @@ -745,8 +797,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { #endif - //Need to remember this to check if snarl clusters overlap the old - //best distance + //Remember the distances of the chain clusters, since we will be writing over them + //as we go int64_t fragment_chain_right = chain_clusters.fragment_best_right; vector read_chain_right = std::move(chain_clusters.read_best_right); @@ -768,19 +820,18 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { chain_clusters.read_best_right.assign(tree_state.all_seeds->size(), -1); for (pair cluster_head : snarl_clusters.read_cluster_heads) { // For each of the clusters for the current snarl, - // first check if it can be combined with any other - // snarl clusters by taking loops in the chain, + // first check if it can be combined with another cluster + // in the same snarl by taking loops in the chain, // then, find if it belongs to the new combined cluster // that includes chain clusters size_t read_num = cluster_head.first; pair snarl_dists = - std::move(tree_state.read_cluster_dists[read_num][cluster_head.second]); + std::move(tree_state.read_cluster_dists[read_num][cluster_head.second]); if (loop_dist_start != -1) { //If there is a loop going out and back into the start of - //the snarl, might combine this cluster with other snarl - //clusters + //the snarl, this cluster may be combined with other snarl clusters //The distance to the right side of the snarl // that is found by taking the leftmost seed and @@ -807,20 +858,21 @@ cerr << " (Possibly) updating looping distance to right of snarl cluster " << r cerr << " Combining this cluster from the left " ; #endif int64_t read_dist = snarl_clusters.read_best_left[read_num] == -1 ? 
-1 : - snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1; - combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], fragment_snarl_cluster_left, - to_erase, snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1, - read_dist, snarl_dists, read_num); + snarl_clusters.read_best_left[read_num] + snarl_dists.first + loop_dist_start - start_length - 1; + int64_t fragment_dist = snarl_clusters.fragment_best_left == -1 ? -1 : + snarl_clusters.fragment_best_left + snarl_dists.first + loop_dist_start - start_length - 1; + + combine_snarl_clusters(cluster_head.second, snarl_cluster_left[read_num], + fragment_snarl_cluster_left, to_erase, fragment_dist, read_dist, snarl_dists, read_num); } } if (loop_dist_end != -1) { //If there is a loop to the right - int64_t new_left = snarl_dists.second == -1 || loop_dist_end == -1 - ? -1 + int64_t new_left = snarl_dists.second == -1 || loop_dist_end == -1 ? -1 : snarl_dists.second + loop_dist_end + snarl_length - end_length; - if (snarl_dists.first == -1 || (new_left != -1 & new_left < snarl_dists.first)){ + if (snarl_dists.first == -1 || (new_left != -1 && new_left < snarl_dists.first)){ //If this is an improvement, update distances snarl_dists.first = new_left; snarl_clusters.read_best_left[read_num] = @@ -833,73 +885,37 @@ cerr << "Updating looping distance to left of snarl cluster " << read_num << ":" #endif } - if (snarl_clusters.fragment_best_right != -1 && snarl_dists.second != -1 ) { + if (snarl_clusters.fragment_best_right != -1 && snarl_dists.second != -1) { //If this cluster can be combined with another cluster //from the right #ifdef DEBUG_CLUSTER -cerr << " Combining this cluster from the right" << endl; +cerr << " Maybe combining this cluster from the right" << endl; #endif int64_t read_dist = snarl_clusters.read_best_right[read_num] == -1 ? -1 : snarl_clusters.read_best_right[read_num] + snarl_dists.second + loop_dist_end - end_length - 1; + int64_t fragment_dist = snarl_clusters.fragment_best_right == -1 ? -1 : + snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1; + combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], - fragment_snarl_cluster_right, to_erase, - snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1, + fragment_snarl_cluster_right, to_erase,fragment_dist, read_dist, snarl_dists, read_num); } } //Now check if this snarl cluster can be combined with any //existing chain clusters - if (read_chain_right[read_num] != -1 && snarl_dists.first != -1 && - snarl_dists.first + read_chain_right[read_num] - start_length-1 - <= tree_state.read_distance_limit) { - //If this snarl cluster's leftmost seed is close enough to - //the rightmost seed in the chain (up to this point), then - //this snarl cluster is in the combined cluster - - if (combined_cluster[read_num] == -1) { - combined_cluster[read_num] = cluster_head.second; - combined_left[read_num] = snarl_dists.first == -1 ? 
-1 : - snarl_dists.first + add_dist_left; - combined_right[read_num] = snarl_dists.second; - } else { - //Cluster - tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); - size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); - - if (new_group == cluster_head.second) { - to_erase.emplace_back(read_num,combined_cluster[read_num]); - } else { - to_erase.push_back(cluster_head); - } - - combined_cluster[read_num] = new_group; - combined_left[read_num] = min_not_minus_one(combined_left[read_num], - snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left); - combined_right[read_num] = min_not_minus_one(combined_right[read_num],snarl_dists.second); - } - if (tree_state.fragment_distance_limit != 0) { - if (fragment_combined_cluster != -1) { - //Also cluster by fragment - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, - cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } - } else { - //If the snarl cluster does not get combined with any of - //the existing chain clusters, then it becomes a new chain cluster - if (tree_state.fragment_distance_limit != 0 && fragment_chain_right != -1 && snarl_dists.first != -1 && - snarl_dists.first+fragment_chain_right-start_length-1 <= tree_state.fragment_distance_limit) { - //Cluster in the same fragment but not the same read - if (fragment_combined_cluster != -1) { - //Also cluster by fragment - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, - cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } + int64_t read_dist = read_chain_right[read_num] == -1 || snarl_dists.first == -1 ? -1 : + snarl_dists.first + read_chain_right[read_num] - start_length-1; + int64_t fragment_dist = tree_state.fragment_distance_limit == 0 || fragment_chain_right == -1 || snarl_dists.first == -1 + ? -1 : snarl_dists.first+fragment_chain_right-start_length-1; + pair new_snarl_dists (snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left, + snarl_dists.second); + bool combined_read = combine_chain_clusters (cluster_head.second,combined_cluster, fragment_combined_cluster, + combined_left, combined_right, new_snarl_dists, to_erase, fragment_dist, read_dist, read_num); + + if ( ! combined_read) { + //Create new chain cluster from snarl cluster to_add.push_back(cluster_head); //Update its distances to the correct nodes in the chain pair d = make_pair(snarl_dists.first == -1 ? 
-1 : snarl_dists.first + add_dist_left, @@ -919,54 +935,21 @@ cerr << " Combining this cluster from the right" << endl; //if they get combined with snarl clusters for (pair cluster_head : chain_clusters.read_cluster_heads) { //For each old chain cluster + + pair& chain_dists = tree_state.read_cluster_dists[cluster_head.first][cluster_head.second]; size_t read_num = cluster_head.first; - pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; - if (snarl_clusters.read_best_left[read_num] != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.read_best_left[read_num] - - start_length-1 <= tree_state.read_distance_limit){ - //If this chain cluster's rightmost seed is close enough - //to the leftmost seed of any cluster in the snarl, then - //this chain cluster is in the combined cluster + int64_t read_dist = snarl_clusters.read_best_left[read_num] == -1 || chain_dists.second == -1 ? -1 : + chain_dists.second + snarl_clusters.read_best_left[read_num] - start_length-1 ; + int64_t fragment_dist = tree_state.fragment_distance_limit == 0 || + snarl_clusters.fragment_best_left == -1 || chain_dists.second == -1 ? -1 : + chain_dists.second + snarl_clusters.fragment_best_left - start_length-1; + pair new_chain_dists (chain_dists.first, chain_dists.second + dist_to_end); + bool combined_read = combine_chain_clusters (cluster_head.second,combined_cluster, fragment_combined_cluster, + combined_left, combined_right, new_chain_dists, to_erase, fragment_dist, read_dist, cluster_head.first); - if (combined_cluster[read_num] == -1) { - //New chain cluster - combined_cluster[read_num] = cluster_head.second; - combined_left[read_num] = chain_dists.first; - combined_right[read_num] = chain_dists.second + dist_to_end; - } else { - //Combine - tree_state.read_union_find[read_num].union_groups(combined_cluster[read_num], cluster_head.second); - size_t new_group = tree_state.read_union_find[read_num].find_group(cluster_head.second); - if (new_group == cluster_head.second) { - to_erase.emplace_back(read_num,combined_cluster[read_num]); - } else { - to_erase.push_back(cluster_head); - } - combined_cluster[read_num] = new_group; - combined_left[read_num] = min_not_minus_one(combined_left[read_num], chain_dists.first); - combined_right[read_num] = min_not_minus_one(combined_right[read_num], chain_dists.second + dist_to_end); - } - if (tree_state.fragment_distance_limit != 0) { - if (fragment_combined_cluster != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } - } else { - //If this chain cluster is on its own, extend its right - //distance to the end of the current snarl - if (tree_state.fragment_distance_limit != 0 && - snarl_clusters.fragment_best_left != -1 && chain_dists.second != -1 - && chain_dists.second + snarl_clusters.fragment_best_left - - start_length-1 <= tree_state.fragment_distance_limit) { - //If this is a new read cluster but the same fragment cluster - if (fragment_combined_cluster != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); - } - fragment_combined_cluster = tree_state.fragment_union_find.find_group(cluster_head.second+tree_state.read_index_offsets[read_num]); - } + if (!combined_read) { chain_dists.second += dist_to_end; if 
((tree_state.fragment_distance_limit == 0 && chain_dists.first - 2 >= tree_state.read_distance_limit && diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index 92c6b604f46..e35f877d4a7 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -225,10 +225,10 @@ class SnarlSeedClusterer { //Cluster all the snarls at the current level and update the tree_state //to add each of the snarls to the parent level - void cluster_snarls(TreeState& tree_state, size_t depth) const; + void cluster_snarl_level(TreeState& tree_state, size_t depth) const; //Cluster all the chains at the current level - void cluster_chains(TreeState& tree_state, size_t depth) const; + void cluster_chain_level(TreeState& tree_state, size_t depth) const; //Cluster the seeds on the specified node NodeClusters cluster_one_node(TreeState& tree_state, From 3bc83bc81805287d61dc9ec178b76eb4d13dd295 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 09:17:06 -0500 Subject: [PATCH 39/79] Revert "let poisson caller use min-ad and het-bias from old caller for now (hopefully temporary)" This reverts commit fd5b96731d4502817366baef5e3c5555c1063899. --- src/snarl_caller.cpp | 131 ++++++++++++++--------------------- src/snarl_caller.hpp | 38 +++++----- src/subcommand/call_main.cpp | 13 ++-- 3 files changed, 79 insertions(+), 103 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 475b6454666..d22171ba617 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -54,19 +54,6 @@ void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double m } } -void SupportBasedSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { - // want to move away from ugly hacks that treat the reference traversal differently, - // so keep all these set the same - if (het_bias >= 0) { - max_het_bias = het_bias; - max_ref_het_bias = het_bias; - max_indel_het_bias = het_bias; - } - if (ref_het_bias >= 0) { - max_ref_het_bias = ref_het_bias; - } -} - int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { int best_allele = -1; for(size_t i = 0; i < supports.size(); i++) { @@ -78,38 +65,6 @@ int SupportBasedSnarlCaller::get_best_support(const vector& supports, c return best_allele; } -double SupportBasedSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, - int second_best_trav, int ref_trav_idx) const { - bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || - (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); - - double bias_limit = 1; - - if (best_trav >= 0 && second_best_trav >=0) { - if (best_trav == ref_trav_idx) { - // Use ref bias limit - - // We decide closeness differently depending on whether best is ref - // or not. In practice, we use this to slightly penalize homozygous - // ref calls (by setting max_ref_het_bias higher than max_het_bias) - // and rather make a less supported alt call instead. This boost - // max sensitivity, and because everything is homozygous ref by - // default in VCF, any downstream filters will effectively reset - // these calls back to homozygous ref. TODO: This shouldn't apply - // when off the primary path! 
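// The clustering refactor in PATCH 38 above folds the repeated merge-and-track
// logic into lambdas (combine_chain_clusters, combine_snarl_clusters) that
// capture the union-find structures by reference and report the surviving
// group head. A tiny self-contained sketch of that pattern (a plain
// parent-array union-find stands in for vg's structures::UnionFind):
#include <cstdio>
#include <numeric>
#include <vector>

struct UnionFind {
    std::vector<size_t> parent;
    explicit UnionFind(size_t n) : parent(n) { std::iota(parent.begin(), parent.end(), 0); }
    size_t find_group(size_t x) { return parent[x] == x ? x : parent[x] = find_group(parent[x]); }
    void union_groups(size_t a, size_t b) { parent[find_group(a)] = find_group(b); }
};

int main() {
    UnionFind uf(5);
    auto combine = [&](size_t a, size_t b) {  // captures uf by reference
        uf.union_groups(a, b);
        return uf.find_group(a);              // surviving cluster head
    };
    size_t head = combine(0, 3);
    head = combine(head, 4);
    std::printf("head %zu same %d\n", head, int(uf.find_group(0) == uf.find_group(4)));  // head 4 same 1
    return 0;
}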
- bias_limit = max_ref_het_bias; - } else if (is_indel) { - // This is an indel - // Use indel bias limit - bias_limit = max_indel_het_bias; - } else { - // Use normal het bias limit - bias_limit = max_het_bias; - } - } - return bias_limit; -} - RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder) : SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { @@ -119,6 +74,19 @@ RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { } +void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, @@ -450,6 +418,39 @@ function RatioSupportSnarlCaller::get_skip_allele_f }; } +double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const { + bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || + (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); + + double bias_limit = 1; + + if (best_trav >= 0 && second_best_trav >=0) { + if (best_trav == ref_trav_idx) { + // Use ref bias limit + + // We decide closeness differently depending on whether best is ref + // or not. In practice, we use this to slightly penalize homozygous + // ref calls (by setting max_ref_het_bias higher than max_het_bias) + // and rather make a less supported alt call instead. This boost + // max sensitivity, and because everything is homozygous ref by + // default in VCF, any downstream filters will effectively reset + // these calls back to homozygous ref. TODO: This shouldn't apply + // when off the primary path! 
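// A toy illustration (editor's sketch, heavily simplified from the real
// caller) of how the restored bias limit acts as a genotyping threshold in
// the ratio caller: a second allele is kept, giving a heterozygous call, as
// long as the best allele has at most bias_limit times its support.
#include <cstdio>

int main() {
    double bias_limit = 6.;                             // e.g. the default max_het_bias
    double best_support = 20., second_support = 4.;     // assumed read supports
    bool heterozygous = best_support <= bias_limit * second_support;
    std::printf("%s\n", heterozygous ? "0/1" : "1/1");  // 20 <= 24, so 0/1
    return 0;
}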
+ bias_limit = max_ref_het_bias; + } else if (is_indel) { + // This is an indel + // Use indel bias limit + bias_limit = max_indel_het_bias; + } else { + // Use normal het bias limit + bias_limit = max_het_bias; + } + } + return bias_limit; +} + + PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder, const algorithms::BinnedDepthIndex& depth_index) : @@ -562,7 +563,7 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double best_genotype_likelihood = -numeric_limits::max(); vector best_genotype; for (const auto& candidate : candidates) { - double gl = genotype_likelihood(candidate.first, candidate.second, traversals, traversal_sizes, ref_trav_idx, exp_depth, depth_err); + double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); if (gl > best_genotype_likelihood) { best_genotype_likelihood = gl; best_genotype = candidate.first; @@ -578,7 +579,6 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, const vector& genotype_supports, const vector& traversals, - const vector& traversal_sizes, int ref_trav_idx, double exp_depth, double depth_err) { assert(genotype_supports.size() == genotype.size()); @@ -602,24 +602,6 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); } } - - // we preserve our het-bias from RatioSupportSnarlCaller as something prior-like here - // todo: use something better - double het_prior = 0.; - double ew_prior = 0.; - if (genotype.size() == 2) { - double het_bias = get_bias(traversal_sizes, genotype[0], genotype[1], ref_trav_idx); - if (genotype[0] != genotype[1]) { - // if the het_bias is greater than 1 (usually it's 6 by default), then - // we get a prior of 5/6 for a het - het_prior = log(1. - 1. / het_bias); - ew_prior = ewens_af_prob_ln({2, 0}, 0.001); - } else { - // and 1/6 for a hom - het_prior = log(1. 
/ het_bias); - ew_prior = ewens_af_prob_ln({0, 1}, 0.001); - } - } // total support of the site Support total_site_support = total_other_support; @@ -661,12 +643,11 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } #ifdef debug - cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood << " prior " << het_prior - << " ew prior " << ew_prior - << " total-prob " << (alleles_log_likelihood + other_log_likelihood + het_prior) << endl; + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood + << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; #endif - return alleles_log_likelihood + other_log_likelihood + het_prior; + return alleles_log_likelihood + other_log_likelihood; } void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, @@ -687,8 +668,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } double total_site_depth = support_val(site_support); - vector traversal_sizes = support_finder.get_traversal_sizes(traversals); - // Set the variant's total depth string depth_string = std::to_string((int64_t)round(total_site_depth)); variant.format.push_back("DP"); @@ -700,7 +679,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); set called_allele_set(genotype.begin(), genotype.end()); - double min_site_support = genotype.size() > 0 ? INFINITY : 0; for (int i = 0; i < traversals.size(); ++i) { vector shared_travs; @@ -720,10 +698,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // there is certainly room for optimization via remembering some of this stuff here vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); - if (in_genotype) { - // update the minimum support - min_site_support = min(min_site_support, total(allele_supports[i])); - } } // get the genotype likelihoods @@ -757,7 +731,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); genotype_supports.push_back(gt_supports[j]); } - double gl = genotype_likelihood({i, j}, genotype_supports, traversals, traversal_sizes, 0, exp_depth, depth_err); + double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); @@ -769,9 +743,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } - // use old quality for now - variant.quality = min_site_support; - // todo /* // Now do the filters diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index e2d7aa78f5c..1c3deace343 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -75,8 +75,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Set some of the parameters void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); - - void set_het_bias(double het_bias, double ref_het_bias = 0.); /// Get the traversal support finder const TraversalSupportFinder& get_support_finder() const; @@ -92,12 +90,6 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Relic from old code static double 
support_val(const Support& support) { return total(support); };
-
-    /// Get the bias used to for comparing two traversals
-    /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel
-    /// see tuning parameters below)
-    double get_bias(const vector<int>& traversal_sizes, int best_trav,
-                    int second_best_trav, int ref_trav_idx) const;
-
     const PathHandleGraph& graph;
 
     SnarlManager& snarl_manager;
@@ -115,15 +107,6 @@ class SupportBasedSnarlCaller : public SnarlCaller {
     /// what's the minimum total support (over all alleles) of the site to make
     /// a call
     size_t min_site_depth = 3;
-    /// What fraction of the reads supporting an alt are we willing to discount?
-    /// At 2, if twice the reads support one allele as the other, we'll call
-    /// homozygous instead of heterozygous. At infinity, every call will be
-    /// heterozygous if even one read supports each allele.
-    double max_het_bias = 6;
-    /// Like above, but applied to ref / alt ratio (instead of alt / ref)
-    double max_ref_het_bias = 6;
-    /// Like the max het bias, but applies to novel indels.
-    double max_indel_het_bias = 6;
 };
 
@@ -137,6 +120,9 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller {
                             TraversalSupportFinder& support_finder);
     virtual ~RatioSupportSnarlCaller();
 
+    /// Set some of the parameters
+    void set_het_bias(double het_bias, double ref_het_bias = 0.);
+
     /// Get the genotype of a site
     virtual vector<int> genotype(const Snarl& snarl,
                                  const vector<SnarlTraversal>& traversals,
@@ -160,12 +146,27 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller {
 
 protected:
 
+    /// Get the bias used for comparing two traversals
+    /// (It differs heuristically depending on whether they are alt/ref/het/hom/snp/indel;
+    /// see tuning parameters below)
+    double get_bias(const vector<int>& traversal_sizes, int best_trav,
+                    int second_best_trav, int ref_trav_idx) const;
+
     /// get a map of the beginning of a node (in forward orientation) on a traversal
     /// used for up-weighting large deletion edges in complex snarls with average support
     unordered_map<id_t, size_t> get_ref_offsets(const SnarlTraversal& ref_trav) const;
 
     /// Tuning
 
+    /// What fraction of the reads supporting an alt are we willing to discount?
+    /// At 2, if twice the reads support one allele as the other, we'll call
+    /// homozygous instead of heterozygous. At infinity, every call will be
+    /// heterozygous if even one read supports each allele.
+    double max_het_bias = 6;
+    /// Like above, but applied to ref / alt ratio (instead of alt / ref)
+    double max_ref_het_bias = 6;
+    /// Like the max het bias, but applies to novel indels.
+    double max_indel_het_bias = 6;
     /// Used for calling 1/2 calls. If both alts (times this bias) are greater than
     /// the reference, the call is made. Set to 0 to deactivate.
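    /// (Illustrative reading of the rule above, not an extra documented setting:
    /// with max_ma_bias = 1, a 1/2 call is only made when each alt allele
    /// out-supports the reference.)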
    double max_ma_bias = 0;
@@ -217,14 +218,13 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     double genotype_likelihood(const vector<int>& genotype,
                                const vector<Support>& genotype_supports,
                                const vector<SnarlTraversal>& traversals,
-                               const vector<int>& traversal_sizes,
                                int ref_trav_idx, double exp_depth, double depth_err);
 
     /// Rank supports
     vector<int> rank_by_support(const vector<Support>& supports);
 
     /// Baseline mapping error rate (gets added to the standard error from coverage)
-    double baseline_mapping_error = 0.005;
+    double baseline_mapping_error = 0.05;
     /// Consider up to the top-k traversals (based on support) for genotyping
     size_t top_k = 25;
 
diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp
index 4a611c67303..3f69185b0c9 100644
--- a/src/subcommand/call_main.cpp
+++ b/src/subcommand/call_main.cpp
@@ -224,6 +224,11 @@ int main_call(int argc, char** argv) {
             cerr << "error [vg call]: when using -l, the same number of paths must be given with -p" << endl;
             return 1;
         }
+        // Check bias option
+        if (!bias_string.empty() && !ratio_caller) {
+            cerr << "error [vg call]: -b can only be used with -B" << endl;
+            return 1;
+        }
 
         // No paths specified: use them all
         if (ref_paths.empty()) {
@@ -268,21 +273,21 @@ int main_call(int argc, char** argv) {
 
         if (ratio_caller == false) {
             // Make a depth index
-            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 50000, 0, true, true);
+            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true);
             // Make a new-style probabilistic caller
             auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index);
             packed_caller = poisson_caller;
         } else {
             // Make an old-style ratio support caller
             auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder);
+            if (het_bias >= 0) {
+                ratio_caller->set_het_bias(het_bias, ref_het_bias);
+            }
             packed_caller = ratio_caller;
         }
         if (min_allele_support >= 0) {
             packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support);
         }
-        if (het_bias >= 0) {
-            packed_caller->set_het_bias(het_bias, ref_het_bias);
-        }
         snarl_caller = unique_ptr<SnarlCaller>(packed_caller);
     }

From 4713e296b3e189ee6a147dbd0a32744fc6be9954 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 12 Nov 2019 09:28:30 -0500
Subject: [PATCH 40/79] fix bug in support splitting and other tweaks

---
 src/snarl_caller.cpp | 21 +++++++++++----------
 src/snarl_caller.hpp |  2 +-
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index d22171ba617..f96afd7c2f1 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -596,8 +596,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector<int>& genotyp
     // split the homozygous support into two
     // from now on we'll treat it like two separate observations, each with half coverage
     vector<Support> fixed_genotype_supports = genotype_supports;
-    if (std::equal(genotype_supports.begin() + 1, genotype_supports.end(), genotype_supports.begin(),
-                   [&](const Support& s1, const Support& s2) { return support_val(s1) == support_val(s2); })) {
+    if (std::equal(genotype.begin() + 1, genotype.end(), genotype.begin())) {
         for (int i = 0; i < genotype_supports.size(); ++i) {
             fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size();
         }
@@ -679,6 +678,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     // get the allele depths
     variant.format.push_back("AD");
     set<int> called_allele_set(genotype.begin(),
genotype.end()); + double min_site_support = genotype.size() > 0 ? INFINITY : 0; for (int i = 0; i < traversals.size(); ++i) { vector shared_travs; @@ -698,6 +698,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // there is certainly room for optimization via remembering some of this stuff here vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); + if (in_genotype) { + // update the minimum support + min_site_support = min(min_site_support, total(allele_supports[i])); + } } // get the genotype likelihoods @@ -743,22 +747,19 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } - // todo - /* + // use old quality for now + variant.quality = min_site_support; + // Now do the filters + // todo: fix and share with other caller variant.filter = "PASS"; if (min_site_support < min_mad_for_filter) { // Apply Min Allele Depth cutoff across all alleles (even ref) variant.filter = "lowad"; - } else if (min_ad_log_likelihood_for_filter != 0 && - ad_log_likelihood < min_ad_log_likelihood_for_filter) { - // We have a het, but the assignment of reads between the two branches is just too weird - variant.filter = "lowxadl"; - } else if ((int64_t)round(total(total_support)) < min_site_depth) { + } else if ((int64_t)round(total_site_depth) < min_site_depth) { // we don't have enough support to want to make a call variant.filter = "lowdepth"; } - */ } void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 1c3deace343..e9e92571c09 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -224,7 +224,7 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { vector rank_by_support(const vector& supports); /// Baseline mapping error rate (gets added to the standard error from coverage) - double baseline_mapping_error = 0.05; + double baseline_mapping_error = 0.005; /// Consider up to the top-k traversals (based on support) for genotyping size_t top_k = 25; From 42aba1d7b130cbee588e1b94ab7c54b65a6b9ef5 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 11:40:33 -0500 Subject: [PATCH 41/79] clean up some traversal support computation --- src/snarl_caller.cpp | 152 +++++++++++++------------------------- src/snarl_caller.hpp | 1 - src/traversal_support.cpp | 65 ++++++++++++---- src/traversal_support.hpp | 32 +++++--- 4 files changed, 126 insertions(+), 124 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index f96afd7c2f1..b8ae480e2f6 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -102,7 +102,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, false, false, false, ref_trav_idx); int best_allele = get_best_support(supports, {}); #ifdef debug @@ -117,7 +117,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = 
support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, true, false, false, ref_trav_idx); vector skips = {best_allele}; for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); @@ -130,7 +130,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, } } // get the supports of each traversal in light of best - vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, false, false, false, ref_trav_idx); int second_best_allele = get_best_support(secondary_supports, {skips}); // get the supports of each traversal in light of second best @@ -139,7 +139,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, int third_best_allele = -1; if (second_best_allele != -1) { // prune out traversals whose exclusive support relative to second best doesn't pass cut - vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, true, false, false, ref_trav_idx); + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, {}, true, false, false, ref_trav_idx); skips.push_back(best_allele); skips.push_back(second_best_allele); for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { @@ -148,7 +148,7 @@ vector RatioSupportSnarlCaller::genotype(const Snarl& snarl, skips.push_back(i); } } - tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, false, false, false, ref_trav_idx); + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, {}, false, false, false, ref_trav_idx); third_best_allele = get_best_support(tertiary_supports, skips); } @@ -298,10 +298,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } // compute the support of our called alleles // todo: I think this undercounts support. shuold be fixed (as in Poisson version) - vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, false, false, 0); - - // get the support of our uncalled alleles, making sure to not include any called support - vector uncalled_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, 0); + vector allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); // Set up the depth format field variant.format.push_back("DP"); @@ -314,7 +311,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, variant.format.push_back("XAAD"); // Compute the total support for all the alts that will be appearing - Support total_support; + Support total_support = support_finder.get_total_traversal_set_support(traversals, 0); // And total alt allele depth for the alt alleles Support alt_support; // Find the min total support of anything called @@ -323,14 +320,11 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, if (!allele_supports.empty()) { //only add info if we made a call for (int allele = 0; allele < traversals.size(); ++allele) { bool is_called = called_allele_set.count(allele); - auto& support = is_called ? 
allele_supports[allele] : uncalled_supports[allele]; + auto& support = allele_supports[allele]; // Set up allele-specific stats for the allele variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(total(support)))); - // Sum up into total depth - total_support += support; - if (allele != 0) { // It's not the primary reference allele alt_support += support; @@ -482,14 +476,14 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, vector traversal_sizes = support_finder.get_traversal_sizes(traversals); // get the supports of each traversal independently - vector supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false, ref_trav_idx); + vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, false, false, false, ref_trav_idx); // sort the traversals by support vector ranked_traversals = rank_by_support(supports); size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size()); // the candidate genotypes and their supports. the numbers here are alleles as indexed in traversals[] - map, vector> candidates; + set> candidates; // pre-filter out some alleles based on poor exclusive support set skips; @@ -504,13 +498,13 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } if (ploidy == 1) { - candidates[{best_allele}] = {supports[best_allele]}; + candidates.insert({best_allele}); } else { assert(ploidy == 2); // we prune out traversals whose exclusive support (structure that is not shared with best traversal) // doesn't meet a certain cutoff - vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, true, false, false, ref_trav_idx); + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, true, false, false, ref_trav_idx); for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { if (j != best_allele && support_val(secondary_exclusive_supports[j]) < min_total_support_for_call && @@ -520,32 +514,21 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } // get the supports of each traversal in light of best - vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, false, false, false, ref_trav_idx); + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, false, false, false, ref_trav_idx); vector ranked_secondary_traversals = rank_by_support(secondary_supports); // add the homozygous genotype for our best allele - candidates[{best_allele, best_allele}] = {supports[best_allele], supports[best_allele]}; + candidates.insert({best_allele, best_allele}); // now look at the top-k second-best traversals size_t sec_count = 0; for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { int second_best_allele = ranked_secondary_traversals[j]; if (!skips.count(second_best_allele) && second_best_allele != best_allele) { - // second best allele's support, sharing nodes with best - Support& second_best_support = secondary_supports[second_best_allele]; - // best allele's support, sharing nodes with second best - Support best_support_het = support_finder.get_traversal_set_support( - {traversals[best_allele], traversals[second_best_allele]}, - {1}, false, false, false, ref_trav_idx)[0]; - // canonical ordering for our set - if (best_allele < second_best_allele) { - candidates[{best_allele, second_best_allele}] = {best_support_het, second_best_support}; - } else { - 
candidates[{second_best_allele, best_allele}] = {second_best_support, best_support_het}; - } + candidates.insert({min(best_allele, second_best_allele), max(best_allele, second_best_allele)}); // also make sure we have our homozygous genotype for the second best allele - candidates[{second_best_allele, second_best_allele}] = {supports[second_best_allele], supports[second_best_allele]}; + candidates.insert({second_best_allele, second_best_allele}); ++sec_count; } } @@ -563,10 +546,10 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, double best_genotype_likelihood = -numeric_limits::max(); vector best_genotype; for (const auto& candidate : candidates) { - double gl = genotype_likelihood(candidate.first, candidate.second, traversals, ref_trav_idx, exp_depth, depth_err); + double gl = genotype_likelihood(candidate, traversals, ref_trav_idx, exp_depth, depth_err); if (gl > best_genotype_likelihood) { best_genotype_likelihood = gl; - best_genotype = candidate.first; + best_genotype = candidate; } } @@ -577,20 +560,26 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, - const vector& genotype_supports, const vector& traversals, int ref_trav_idx, double exp_depth, double depth_err) { - assert(genotype_supports.size() == genotype.size()); assert(genotype.size() == 1 || genotype.size() == 2); + // get the total support over the site + // todo: bump this to calling method to not recompute for each genotype!!! + Support total_site_support = support_finder.get_total_traversal_set_support(traversals, ref_trav_idx); + + // get the genotype support + // todo : we aren't using the non-genotype allele supports in this method, add flag to not compute them here! + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, ref_trav_idx); - // we need the support of all traversals *not* in the genotype. - Support total_other_support; - // we are running in a mode that will ignore stuff in our genotype, and only count the remainders once. - vector other_supports = support_finder.get_traversal_set_support(traversals, genotype, false, true, true, ref_trav_idx); - for (auto& other_support : other_supports) { - total_other_support += other_support; + // get the total support of traversals *not* in the genotype + // note that if we sum it up from allele_supports, it will likely be underestimated when using min (instead of avg supports) + // so we subtract it out of the total instead + Support total_other_support = total_site_support; + set genotype_set(genotype.begin(), genotype.end()); + for (int allele : genotype_set) { + total_other_support += -1. 
* genotype_supports[allele]; } // split the homozygous support into two @@ -598,16 +587,10 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp vector fixed_genotype_supports = genotype_supports; if (std::equal(genotype.begin() + 1, genotype.end(), genotype.begin())) { for (int i = 0; i < genotype_supports.size(); ++i) { - fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype_supports.size(); + fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype.size(); } } - // total support of the site - Support total_site_support = total_other_support; - for (auto& support : fixed_genotype_supports) { - total_site_support += support; - } - // how many reads would we expect to not map to our genotype due to error double error_rate = std::min(0.25, depth_err + baseline_mapping_error); double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support); @@ -631,12 +614,13 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp // now we compute the likelihood of our genotype double alleles_log_likelihood = 0; - for (int i = 0; i < fixed_genotype_supports.size(); ++i) { - double allele_ll = poisson_prob_ln(std::round(support_val(fixed_genotype_supports[i])), allele_poisson_lambda); + for (int allele : genotype) { + const Support& allele_support = fixed_genotype_supports[allele]; + double allele_ll = poisson_prob_ln(std::round(support_val(allele_support)), allele_poisson_lambda); alleles_log_likelihood += allele_ll; #ifdef debug - cerr << " a[" << i <<"]=" << genotype[i] << " sup=" << genotype_supports[i] << " fix-sup=" << fixed_genotype_supports[i] + cerr << " a[" << allele <<"]=" << " sup=" << genotype_supports[allele] << " fix-sup=" << allele_support << " prob " << allele_ll << endl; #endif } @@ -658,14 +642,10 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, assert(traversals.size() == variant.alleles.size()); // Get the depth of the site - - // get the unique supports (useful only for getting a total) - vector unique_supports = support_finder.get_traversal_set_support(traversals, {}, false, true, true, 0); - Support site_support; - for (const Support& sup : unique_supports) { - site_support += sup; - } - double total_site_depth = support_val(site_support); + // todo: pass this down to genotype_likelihood + + Support total_site_support = support_finder.get_total_traversal_set_support(traversals, 0); + double total_site_depth = support_val(total_site_support); // Set the variant's total depth string depth_string = std::to_string((int64_t)round(total_site_depth)); @@ -677,30 +657,19 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); - set called_allele_set(genotype.begin(), genotype.end()); + + // get the genotype support + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); + set genotype_set(genotype.begin(), genotype.end()); double min_site_support = genotype.size() > 0 ? INFINITY : 0; + // update the allele depths for (int i = 0; i < traversals.size(); ++i) { - vector shared_travs; - bool in_genotype = called_allele_set.count(i); - if (in_genotype) { - // if we're in the genotype, then we share support with other alleles. 
- for (int a : called_allele_set) { - if (a != i) { - shared_travs.push_back(a); - } - } - } else { - // if we're not in the genotype, then we ignore support of everything in the genotype - shared_travs = genotype; - } - // we recompute all supports for each allele to get it's support relative to the genotype - // there is certainly room for optimization via remembering some of this stuff here - vector allele_supports = support_finder.get_traversal_set_support(traversals, shared_travs, false, !in_genotype, false); - variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_supports[i])))); - if (in_genotype) { + Support allele_support = genotype_supports[i]; + variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_support)))); + if (genotype_set.count(i)) { // update the minimum support - min_site_support = min(min_site_support, total(allele_supports[i])); + min_site_support = min(min_site_support, total(genotype_supports[i])); } } @@ -722,28 +691,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // assume ploidy 2 for (int i = 0; i < traversals.size(); ++i) { for (int j = i; j < traversals.size(); ++j) { - vector genotype_supports; - if (i == j) { - // put the full support of allele for each copy of homozygous (genotype method expects this) - vector gt_supports = support_finder.get_traversal_set_support(traversals, {}, false, false, false); - genotype_supports = {gt_supports[i], gt_supports[i]}; - } else { - // compute each support relative to the other - // todo: we can speed this up by saving above, or filtering down traversal list to just our genotype alleles - vector gt_supports = support_finder.get_traversal_set_support(traversals, {j}, false, false, false); - genotype_supports.push_back(gt_supports[i]); - gt_supports = support_finder.get_traversal_set_support(traversals, {i}, false, false, false); - genotype_supports.push_back(gt_supports[j]); - } - double gl = genotype_likelihood({i, j}, genotype_supports, traversals, 0, exp_depth, depth_err); + double gl = genotype_likelihood({i, j}, traversals, 0, exp_depth, depth_err); // convert from natural log to log10 by dividing by ln(10) variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); - - // use our likelihood as the VCF quality - // todo: check if there's something more conventional to use - if ((genotype[0] == i && genotype[1] == j) || (genotype[0] == j && genotype[1] == i)) { - variant.quality = logprob_to_phred(gl); - } } } diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index e9e92571c09..62f18563f56 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -216,7 +216,6 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { /// Homozygous alleles are split into two, with half support each /// The (natural) logoarithm is returned double genotype_likelihood(const vector& genotype, - const vector& genotype_supports, const vector& traversals, int ref_trav_idx, double exp_depth, double depth_err); diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index e3fed43860a..a3ce23539a5 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -60,16 +60,55 @@ tuple TraversalSupportFinder::get_child_support(const Sna Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const { - return get_traversal_set_support({traversal}, {}, false, false, false).at(0); + return get_traversal_set_support({traversal}, {}, {}, false, false, 
false).at(0); +} + +Support TraversalSupportFinder::get_total_traversal_set_support(const vector& traversals, + int ref_trav_idx) const { + // share everything + vector shared_travs(traversals.size()); + for (int i = 0; i < shared_travs.size(); ++i) { + shared_travs[i] = i; + } + + // get the support of everything, where all shared nodes and edges are scaled by the number of times they're shared + vector supports = get_traversal_set_support(traversals, shared_travs, {}, false, false, true, ref_trav_idx); + + // sum it up + Support total; + for (const Support& support : supports) { + total += support; + } + + return total; +} + +vector TraversalSupportFinder::get_traversal_genotype_support(const vector& traversals, + const vector& genotype, + int ref_trav_idx) { + set tgt_trav_set(genotype.begin(), genotype.end()); + vector tgt_travs(tgt_trav_set.begin(), tgt_trav_set.end()); + // get the support of just the alleles in the genotype, evenly splitting shared stuff + vector allele_support = get_traversal_set_support(traversals, tgt_travs, tgt_trav_set, false, false, true, ref_trav_idx); + // get the support of everythin else, treating stuff in the genotype alleles as 0 + vector other_support = get_traversal_set_support(traversals, tgt_travs, {}, false, true, false, ref_trav_idx); + // combine the above two vectors + for (int allele : tgt_travs) { + other_support[allele] = allele_support[allele]; + } + return other_support; } vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, const vector& shared_travs, + const set& tgt_travs, bool exclusive_only, bool exclusive_count, - bool unique, + bool mutual_shared, int ref_trav_idx) const { - assert(!unique || (exclusive_count || exclusive_only)); + + // mutual_shared only makes sense when everything is shared + assert(!mutual_shared || shared_travs.size() == traversals.size() || shared_travs.size() == tgt_travs.size()); // pass 1: how many times have we seen a node or edge unordered_map node_counts; @@ -139,7 +178,8 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector 0) ? 0. : 1. / (1. + share_count); + double denom_add = mutual_shared ? 0 : 1; + double scale_factor = ((exclusive_only || exclusive_count) && share_count > 0) ? 0. : 1. 
/ (denom_add + share_count); // when looking at exclusive support, we don't normalize by skipped lengths if (scale_factor != 0 || !exclusive_only || exclusive_count) { @@ -156,6 +196,10 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector TraversalSupportFinder::get_traversal_set_support(const vector 0 && visit_idx < trav.visit_size() - 1)) { update_support(trav_idx, min_support, avg_support, length, share_count); @@ -195,15 +235,14 @@ vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, + int ref_trav_idx = -1) const; + + /// wrapper for using get_traversal_set_support to get the support for + /// some alleles in a genotype, where everything is split evently among them + /// anything not in the genotype gets a support using "exclusive_count" + /// where nodes taken by the genotype are counted as 0 + virtual vector get_traversal_genotype_support(const vector& traversals, + const vector& genotype, + int ref_trav_idx = -1); + + /// traversals: get support for each traversal in this set + /// shared_travs: if a node appears N times in shared_travs, then it will count as 1 / (N+1) support + /// tgt_travs: if not empty, only compute support for these traversals (remaining slots in output vector left 0) + /// eclusive_only: shared_travs are completely ignored + /// exclusive_count: anything in shared_travs treated as 0 + /// mutual_shared: shared_travs count as 1/N support (instead of 1/(N+1)). usefuly for total support + /// ref_trav_idx: index of reference traversal if known virtual vector get_traversal_set_support(const vector& traversals, const vector& shared_travs, + const set& tgt_travs, bool exclusive_only, bool exclusive_count, - bool unique, + bool mutual_shared, int ref_trav_idx = -1) const; /// Get the total length of all nodes in the traversal From 278ec93d3443621fe343e9daa884b548e67994d0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 15:43:09 -0500 Subject: [PATCH 42/79] float depths. 
deprecated total support function that doesnt work --- src/algorithms/coverage_depth.cpp | 26 +++++++++++++--- src/algorithms/coverage_depth.hpp | 6 ++-- src/snarl_caller.cpp | 51 ++++++++++++++----------------- src/traversal_support.cpp | 20 ------------ src/traversal_support.hpp | 8 +---- 5 files changed, 48 insertions(+), 63 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index 6d28335f4d7..7429a3b6e0d 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -130,18 +130,18 @@ vector> binned_packed_depth(const Packer& return binned_depths; } -unordered_map>> binned_packed_depth_index(const Packer& packer, +unordered_map>> binned_packed_depth_index(const Packer& packer, const vector& path_names, size_t bin_size, size_t min_coverage, bool include_deletions, bool std_err) { - unordered_map>> depth_index; + unordered_map>> depth_index; for (const string& path_name : path_names) { vector> binned_depths = binned_packed_depth(packer, path_name, bin_size, min_coverage, include_deletions); // todo: probably more efficent to just leave in sorted vector - map>& depth_map = depth_index[path_name]; + map>& depth_map = depth_index[path_name]; for (auto& binned_depth : binned_depths) { double var = get<3>(binned_depth); // optionally convert variance to standard error @@ -155,14 +155,30 @@ unordered_map>> binned_packed_depth_ind return depth_index; } -const pair& get_depth_from_index(const unordered_map>>& depth_index, - const string& path_name, size_t offset) { +const pair& get_depth_from_index(const unordered_map>>& depth_index, + const string& path_name, size_t offset) { auto ub = depth_index.at(path_name).upper_bound(offset); --ub; return ub->second; } +pair get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t start_offset, size_t end_offset) { + auto ub = depth_index.at(path_name).upper_bound(start_offset); + --ub; + auto ub_end = depth_index.at(path_name).upper_bound(end_offset); + size_t count = 0; + pair total = make_pair(0, 0); + for (auto cur = ub; cur != ub_end; ++cur, ++count) { + total.first += cur->second.first; + total.second += cur->second.second; + } + // todo: better way of combining? + total.first /= (double)count; + total.second /= (double)count; + return total; +} + // draw (roughly) max_nodes nodes from the graph using the random seed static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { default_random_engine generator(random_seed); diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 9084949df36..c6e7ed3797f 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -38,7 +38,7 @@ vector> binned_packed_depth(const Packer& /// Use the above function to retrieve the binned depths of a list of paths, and store them indexed by start /// coordinate. If std_err is true, store instead of -using BinnedDepthIndex = unordered_map>>; +using BinnedDepthIndex = unordered_map>>; BinnedDepthIndex binned_packed_depth_index(const Packer& packer, const vector& path_names, size_t bin_size, @@ -47,8 +47,8 @@ BinnedDepthIndex binned_packed_depth_index(const Packer& packer, bool std_err); /// Query index created above -/// Todo: optionally smooth over adjacent bins? 
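+/// (The range overload below averages the mean and the error of every bin
+/// overlapping [start_offset, end_offset], per the implementation above.)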
-const pair& get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t offset); +const pair& get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t offset); +pair get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t start_offset, size_t end_offset); /// Return the mean and variance of coverage of randomly sampled nodes from a GAM /// Nodes with less than min_coverage are ignored diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index b8ae480e2f6..b9f9356d715 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -297,8 +297,10 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, shared_travs.push_back(genotype[0]); } // compute the support of our called alleles - // todo: I think this undercounts support. shuold be fixed (as in Poisson version) vector allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); + + // Compute the total support for all the alts that will be appearing + Support total_support = std::accumulate(allele_supports.begin(), allele_supports.end(), Support()); // Set up the depth format field variant.format.push_back("DP"); @@ -309,9 +311,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, variant.format.push_back("XADL"); // Also the alt allele depth variant.format.push_back("XAAD"); - - // Compute the total support for all the alts that will be appearing - Support total_support = support_finder.get_total_traversal_set_support(traversals, 0); + // And total alt allele depth for the alt alleles Support alt_support; // Find the min total support of anything called @@ -536,10 +536,9 @@ vector PoissonSupportSnarlCaller::genotype(const Snarl& snarl, } // expected depth from our coverage - const pair& start_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first); - const pair& end_depth = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.second); - double exp_depth = (start_depth.first + end_depth.first) / 2.; - double depth_err = (start_depth.second + end_depth.second) / 2.; + auto depth_info = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first, ref_range.second); + double exp_depth = depth_info.first; + double depth_err = depth_info.second; assert(!isnan(exp_depth) && !isnan(depth_err)); // genotype (log) likelihoods @@ -565,23 +564,21 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp assert(genotype.size() == 1 || genotype.size() == 2); - // get the total support over the site - // todo: bump this to calling method to not recompute for each genotype!!! - Support total_site_support = support_finder.get_total_traversal_set_support(traversals, ref_trav_idx); - // get the genotype support - // todo : we aren't using the non-genotype allele supports in this method, add flag to not compute them here! 
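+    // (get_traversal_genotype_support splits support shared between the
+    // genotype alleles evenly among them, and counts the genotype's nodes
+    // as zero when scoring the remaining alleles)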
vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, ref_trav_idx); + // get the total support over the site + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); + // get the total support of traversals *not* in the genotype - // note that if we sum it up from allele_supports, it will likely be underestimated when using min (instead of avg supports) - // so we subtract it out of the total instead - Support total_other_support = total_site_support; + Support total_other_support; set genotype_set(genotype.begin(), genotype.end()); - for (int allele : genotype_set) { - total_other_support += -1. * genotype_supports[allele]; + for (int i = 0; i < traversals.size(); ++i) { + if (!genotype_set.count(i)) { + total_other_support += genotype_supports[i]; + } } - + // split the homozygous support into two // from now on we'll treat it like two separate observations, each with half coverage vector fixed_genotype_supports = genotype_supports; @@ -641,10 +638,11 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, assert(traversals.size() == variant.alleles.size()); - // Get the depth of the site - // todo: pass this down to genotype_likelihood + // get the genotype support + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); - Support total_site_support = support_finder.get_total_traversal_set_support(traversals, 0); + // Get the depth of the site + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); double total_site_depth = support_val(total_site_support); // Set the variant's total depth @@ -658,8 +656,6 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // get the allele depths variant.format.push_back("AD"); - // get the genotype support - vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0); set genotype_set(genotype.begin(), genotype.end()); double min_site_support = genotype.size() > 0 ? 
INFINITY : 0; @@ -682,10 +678,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, // expected depth from our coverage pair ref_range = make_pair(variant.position, variant.position + variant.ref.length()); - const pair& start_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first); - const pair& end_depth = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.second); - double exp_depth = (start_depth.first + end_depth.first) / 2.; - double depth_err = (start_depth.second + end_depth.second) / 2.; + auto depth_info = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first, ref_range.second); + double exp_depth = depth_info.first; + double depth_err = depth_info.second; assert(!isnan(exp_depth) && !isnan(depth_err)); // assume ploidy 2 diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index a3ce23539a5..2bcb478b7c6 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -63,26 +63,6 @@ Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& trav return get_traversal_set_support({traversal}, {}, {}, false, false, false).at(0); } -Support TraversalSupportFinder::get_total_traversal_set_support(const vector& traversals, - int ref_trav_idx) const { - // share everything - vector shared_travs(traversals.size()); - for (int i = 0; i < shared_travs.size(); ++i) { - shared_travs[i] = i; - } - - // get the support of everything, where all shared nodes and edges are scaled by the number of times they're shared - vector supports = get_traversal_set_support(traversals, shared_travs, {}, false, false, true, ref_trav_idx); - - // sum it up - Support total; - for (const Support& support : supports) { - total += support; - } - - return total; -} - vector TraversalSupportFinder::get_traversal_genotype_support(const vector& traversals, const vector& genotype, int ref_trav_idx) { diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp index ff2051cbb5c..42fd10607bb 100644 --- a/src/traversal_support.hpp +++ b/src/traversal_support.hpp @@ -45,13 +45,7 @@ class TraversalSupportFinder { /// Get the support of a traversal /// Child snarls are handled as in the old call code: their maximum support is used virtual Support get_traversal_support(const SnarlTraversal& traversal) const; - - /// wrapper for using get_traversal_set_support to get the total support - /// (sets shared_travs to the whole set, mutual_shared to true, then - /// sums over the results) - virtual Support get_total_traversal_set_support(const vector& traversals, - int ref_trav_idx = -1) const; - + /// wrapper for using get_traversal_set_support to get the support for /// some alleles in a genotype, where everything is split evently among them /// anything not in the genotype gets a support using "exclusive_count" From aad52e4e2c98b39054e1b67a54d9a3ad98d7513d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 15:55:33 -0500 Subject: [PATCH 43/79] turn bin size way down --- src/snarl_caller.cpp | 2 +- src/subcommand/call_main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index b9f9356d715..6556bbb7bb4 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -589,7 +589,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp } // how many reads would we expect to not map to our genotype due to error - double error_rate = std::min(0.25, depth_err + 
baseline_mapping_error);
+    double error_rate = std::min(0.05, depth_err + baseline_mapping_error);
     double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support);
 
     // and our likelihood for the unmapped reads we see:
diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp
index 3f69185b0c9..baae2060516 100644
--- a/src/subcommand/call_main.cpp
+++ b/src/subcommand/call_main.cpp
@@ -273,7 +273,7 @@ int main_call(int argc, char** argv) {
 
         if (ratio_caller == false) {
             // Make a depth index
-            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 500000, 0, true, true);
+            depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, 50, 0, true, true);
             // Make a new-style probabilistic caller
             auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *packed_support_finder, depth_index);
             packed_caller = poisson_caller;
         } else {

From 21fcfafc1120263e61488e71a6eee1b873c8fbde Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 12 Nov 2019 16:57:31 -0500
Subject: [PATCH 44/79] fix vcf header

---
 src/snarl_caller.cpp | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 6556bbb7bb4..bb55e987755 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -670,10 +670,8 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     }
 
     // get the genotype likelihoods
-    // as above, there's some overlap in these computations as those used in genotype() to begin with
-    // this is an issue with the class interface which probably tries too hard to avoid being VCF-dependent
-    // but if it causes a slowdown (hasn't seemed to be a factor so far), the code could be re-organized
-    // to either store some of this information, or comptue the genotype and vcf fields in a single shot
+    vector<double> gen_likelihoods;
+    double gen_likelihood;
     variant.format.push_back("GL");
 
     // expected depth from our coverage
@@ -687,12 +685,33 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     // assume ploidy 2
     for (int i = 0; i < traversals.size(); ++i) {
         for (int j = i; j < traversals.size(); ++j) {
             double gl = genotype_likelihood({i, j}, traversals, 0, exp_depth, depth_err);
+            gen_likelihoods.push_back(gl);
+            if (vector<int>({i, j}) == genotype || vector<int>({j,i}) == genotype) {
+                gen_likelihood = gl;
+            }
             // convert from natural log to log10 by dividing by ln(10)
             variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258));
         }
     }
 
-    // use old quality for now
+    // get the GQ
+    double prior = log(1. / (double)traversals.size());
+    double p_reads = prior + gen_likelihoods[0];
+    // note that we should be summing over all the likelihoods as considered in genotype()
+    // todo: figure out how to move this to that method.
+    // (or make sure more uncalled stuff makes it into the vcf so we have more traversals to sum over here)
+    /*
+    for (int i = 1; i < gen_likelihoods.size(); ++i) {
+        p_reads = add_log(p_reads, prior + gen_likelihoods[i]);
+    }
+    double posterior = gen_likelihood + prior - p_reads;
+    double gq = logprob_to_phred(logprob_invert(posterior));
+    variant.format.push_back("GQ");
+    variant.samples[sample_name]["GQ"].push_back(std::to_string(gq));
+    */
+
+    // our old min-support based quality as hack until
+    // qual / gq properly sorted out
     variant.quality = min_site_support;
 
     // Now do the filters
@@ -712,6 +731,11 @@ void PoissonSupportSnarlCaller::update_vcf_header(string& header) const {
     header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
     header += "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n";
     header += "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n";
     header += "##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype\">\n";
+    //header += "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality, the Phred-scaled probability estimate of the called genotype\">\n";
+    header += "##FILTER=<ID=lowad,Description=\"Variant does not meet minimum allele read support threshold of " + std::to_string(min_mad_for_filter) + "\">\n";
+    header += "##FILTER=<ID=lowdepth,Description=\"Variant has read depth less than " + std::to_string(min_site_depth) + "\">\n";
 }

From 3e5c17910b147ddb24c48823de7edf7ebe9bf747 Mon Sep 17 00:00:00 2001
From: Xian Chang
Date: Tue, 12 Nov 2019 14:31:45 -0800
Subject: [PATCH 45/79] Stopped copying seeds

---
 src/seed_clusterer.cpp          | 151 +++++++++++++-------------------
 src/seed_clusterer.hpp          |  34 +++----
 src/unittest/seed_clusterer.cpp |  52 +++++++++--
 3 files changed, 124 insertions(+), 113 deletions(-)

diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp
index dd16f19c8c5..d4597a0b31d 100644
--- a/src/seed_clusterer.cpp
+++ b/src/seed_clusterer.cpp
@@ -2,7 +2,7 @@
 
 #include
 
-#define DEBUG_CLUSTER
+//#define DEBUG_CLUSTER
 namespace vg {
 
     SnarlSeedClusterer::SnarlSeedClusterer( MinimumDistanceIndex& dist_index) :
                                             dist_index(dist_index){
    };
 
-    SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (vector<pos_t> seeds, int64_t read_distance_limit) const {
+    SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (const vector<pos_t>& seeds, int64_t read_distance_limit) const {
+
         vector<vector<pos_t>> all_seeds;
         all_seeds.push_back(seeds);
+
         tuple<vector<SnarlSeedClusterer::cluster_group_t>,SnarlSeedClusterer::cluster_group_t> all_clusters = cluster_seeds(all_seeds, read_distance_limit, 0);
+
         return std::get<0>(all_clusters)[0];
     };
 
     tuple<vector<SnarlSeedClusterer::cluster_group_t>,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds (
-                  vector<vector<pos_t>>& all_seeds, int64_t read_distance_limit,
+                  const vector<vector<pos_t>>& all_seeds, int64_t read_distance_limit,
                   int64_t fragment_distance_limit) const {
         /* Given a vector of seeds and a limit, find a clustering of seeds where
          * seeds that are closer than the limit cluster together.
@@ -34,11 +37,9 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; } //For each level of the snarl tree, maps snarls (index into - //dist_index.snarl_indexes) at that level to - //nodes belonging to the snarl + //dist_index.snarl_indexes) at that level to nodes belonging to the snarl //This is later used to populate snarl_to_node in the tree state - vector>>> - snarl_to_nodes_by_level; + vector>>> snarl_to_nodes_by_level; snarl_to_nodes_by_level.resize(dist_index.tree_depth+1); @@ -55,10 +56,7 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; get_nodes(tree_state, snarl_to_nodes_by_level); //Initialize the tree state to be the bottom level - if (dist_index.tree_depth >= 0) { - tree_state.snarl_to_nodes = - move(snarl_to_nodes_by_level[dist_index.tree_depth]); - } + tree_state.snarl_to_nodes = std::move(snarl_to_nodes_by_level[dist_index.tree_depth]); for (int depth = dist_index.tree_depth ; depth >= 0 ; depth --) { //Go through each level of the tree, bottom up, and cluster that @@ -67,12 +65,10 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; // // tree_state knows all children of the snarls at this level + // Bring in the direct child nodes that come in at this level in the snarl tree. + // They only ever occur below the root. if (depth != 0) { - // Bring in the direct child nodes that come in at this level - // in the snarl tree. - // They only ever occur below the root. - tree_state.parent_snarl_to_nodes = - move(snarl_to_nodes_by_level[depth - 1]); + tree_state.parent_snarl_to_nodes = std::move(snarl_to_nodes_by_level[depth - 1]); } #ifdef DEBUG_CLUSTER @@ -90,7 +86,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cluster_chain_level(tree_state, depth); // Swap buffer over for the next level - tree_state.snarl_to_nodes = move(tree_state.parent_snarl_to_nodes); + tree_state.snarl_to_nodes = std::move(tree_state.parent_snarl_to_nodes); tree_state.chain_to_snarls.clear(); } @@ -143,9 +139,10 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { // Assign each seed to a node. hash_set seen_nodes; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ - vector& seeds = tree_state.all_seeds->at(read_num); + const vector& seeds = tree_state.all_seeds->at(read_num); for (size_t i = 0; i < seeds.size(); i++) { - id_t id = get_id(seeds.at(i)); + pos_t pos = seeds.at(i); + id_t id = get_id(pos); //Assign the seed to a node tree_state.node_to_seeds[read_num].emplace_back(id, i); @@ -211,13 +208,10 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { assert(snarl_index.parent_id >= dist_index.min_node_id); assert(snarl_index.parent_id <= dist_index.max_node_id); #endif - size_t parent_snarl_i = - dist_index.getPrimaryAssignment( - snarl_index.parent_id); + size_t parent_snarl_i = dist_index.getPrimaryAssignment( snarl_index.parent_id); tree_state.parent_snarl_to_nodes[parent_snarl_i].emplace_back( - NetgraphNode (snarl_i, SNARL), - cluster_one_snarl(tree_state, snarl_i)); + NetgraphNode (snarl_i, SNARL), cluster_one_snarl(tree_state, snarl_i)); #ifdef DEBUG_CLUSTER cerr << "Recording snarl number " << snarl_i @@ -262,10 +256,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { // Find the node ID that heads the parent of that chain. size_t parent_id = dist_index.chain_indexes[chain_i].parent_id; // It must be a legitimate node ID we cover. 
-#ifdef DEBUG_CLUSTER assert(parent_id >= dist_index.min_node_id); assert(parent_id <= dist_index.max_node_id); -#endif + // Map it to the snarl number that should be represented by it // (and thus also contain the chain) size_t parent_snarl_i = dist_index.getPrimaryAssignment(parent_id); @@ -318,10 +311,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //ends of the node pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; - int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) - : get_offset(seed) + 1; - int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 - : node_length - get_offset(seed); + int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) : get_offset(seed) + 1; + int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 : node_length - get_offset(seed); node_clusters.read_best_left[read_num] = min_not_minus_one(dist_left, node_clusters.read_best_left[read_num]); @@ -334,8 +325,11 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { tree_state.read_union_find[read_num].union_groups(group_id, iter->second); if (tree_state.fragment_distance_limit != 0 ) { - if (fragment_group_id == -1 ) fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; - tree_state.fragment_union_find.union_groups(fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); + if (fragment_group_id == -1 ) { + fragment_group_id = seed_range_start->second + tree_state.read_index_offsets[read_num]; + } + tree_state.fragment_union_find.union_groups( + fragment_group_id, iter->second + tree_state.read_index_offsets[read_num]); } } @@ -352,7 +346,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } } #ifdef DEBUG_CLUSTER - cerr << "Found single cluster on node " << node_id << " with fragment dists " << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; + cerr << "Found single cluster on node " << node_id << " with fragment dists " + << node_clusters.fragment_best_left << " " << node_clusters.fragment_best_right << endl; bool got_left = false; bool got_right = false; @@ -587,7 +582,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { combined_left[read_num] = min_not_minus_one(combined_left[read_num], dists.first); combined_right[read_num] = min_not_minus_one(combined_right[read_num], dists.second); } - cerr << "COMBINING READ: " ; if (tree_state.fragment_distance_limit != 0) { if (fragment_combined_group != -1) { tree_state.fragment_union_find.union_groups(fragment_combined_group, @@ -595,7 +589,6 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { } fragment_combined_group = tree_state.fragment_union_find.find_group( cluster_group+tree_state.read_index_offsets[read_num]); - cerr << " AND FRAGMENT" << endl; } return true; } else if (fragment_dist != -1 && @@ -1208,16 +1201,12 @@ cerr << " Maybe combining this cluster from the right" << endl; //Update distances and cluster head of new cluster size_t new_g = tree_state.read_union_find[read_num].find_group(new_group); - if (new_g != new_group) { - snarl_clusters.read_cluster_heads.erase(make_pair(read_num,new_group)); - } - if (new_g != combined_group) { - snarl_clusters.read_cluster_heads.erase(make_pair(read_num,combined_group)); - } + if (new_g != new_group) snarl_clusters.read_cluster_heads.erase(make_pair(read_num,new_group)); + if (new_g != combined_group) snarl_clusters.read_cluster_heads.erase(make_pair(read_num,combined_group)); + 
snarl_clusters.read_cluster_heads.emplace(read_num,new_g); - end_dists = make_pair( - min_not_minus_one(end_dists.first, old_dists.first), - min_not_minus_one(end_dists.second, old_dists.second)); + end_dists = make_pair( min_not_minus_one(end_dists.first, old_dists.first), + min_not_minus_one(end_dists.second, old_dists.second)); tree_state.read_cluster_dists[read_num][new_g] = end_dists; new_group = new_g; combined_group = new_g; @@ -1239,8 +1228,7 @@ cerr << " Maybe combining this cluster from the right" << endl; //Get the children of this snarl and their clusters - vector>& child_nodes = - tree_state.snarl_to_nodes[snarl_index_i]; + vector>& child_nodes = tree_state.snarl_to_nodes[snarl_index_i]; int64_t start_length = snarl_index.nodeLength(0); int64_t end_length = snarl_index.nodeLength(snarl_index.num_nodes*2 -1); @@ -1250,7 +1238,7 @@ cerr << " Maybe combining this cluster from the right" << endl; hash_map, pair> old_dists; for (size_t i = 0; i < child_nodes.size() ; i++) { - //Go through each child node of the netgraph and get clusters + //Go through each child node of the netgraph NetgraphNode& child = child_nodes [i].first; @@ -1259,18 +1247,15 @@ cerr << " Maybe combining this cluster from the right" << endl; id_t child_node_id = child.id_in_parent(dist_index); //Rank of this node in the snarl - //If this node is a snarl/chain, then this snarl will be the - //secondary snarl + //Note, if this node is a snarl/chain, then this snarl will be the secondary snarl size_t node_rank = child.rank_in_parent(dist_index, child_node_id); - size_t rev_rank = node_rank % 2 == 0 - ? node_rank + 1 : node_rank - 1; + size_t rev_rank = node_rank % 2 == 0 ? node_rank + 1 : node_rank - 1; if (child.node_type == NODE) { //If this node is a node, we need to find the clusters int64_t node_len = snarl_index.nodeLength(node_rank); - child_nodes[i].second = cluster_one_node( - tree_state, child_node_id, node_len); + child_nodes[i].second = cluster_one_node(tree_state, child_node_id, node_len); } //Represents all the clusters on this child node @@ -1343,10 +1328,8 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa id_t other_node_id = other_node.id_in_parent(dist_index); //Rank of this node in the snarl - size_t other_rank = other_node.rank_in_parent(dist_index, - other_node_id); - size_t other_rev = other_rank % 2 == 0 - ? other_rank + 1 : other_rank - 1; + size_t other_rank = other_node.rank_in_parent(dist_index, other_node_id); + size_t other_rev = other_rank % 2 == 0 ? 
other_rank + 1 : other_rank - 1; #ifdef DEBUG_CLUSTER cerr << "Other net graph node is " << typeToString(other_node.node_type) @@ -1358,14 +1341,11 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa //Find distance from each end of current node (i) to //each end of other node (j) - int64_t dist_l_l = snarl_index.snarlDistance( - rev_rank, other_rank); - int64_t dist_l_r = snarl_index.snarlDistance( - rev_rank, other_rev); - int64_t dist_r_l = snarl_index.snarlDistance( - node_rank, other_rank); - int64_t dist_r_r = snarl_index.snarlDistance( - node_rank, other_rev); + int64_t dist_l_l = snarl_index.snarlDistance(rev_rank, other_rank); + int64_t dist_l_r = snarl_index.snarlDistance(rev_rank, other_rev); + int64_t dist_r_l = snarl_index.snarlDistance(node_rank, other_rank); + int64_t dist_r_r = snarl_index.snarlDistance(node_rank, other_rev); + #ifdef DEBUG_CLUSTER cerr << "\t distances between ranks " << node_rank << " and " << other_rank << ": " << dist_l_l << " " << dist_l_r << " " << dist_r_l << " " @@ -1385,13 +1365,11 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank if (max({dist_l_l, dist_l_r, dist_r_l, dist_r_r}) != -1 && ((tree_state.fragment_distance_limit == 0 && - MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, - dist_r_l, dist_r_r})-2 <= tree_state.read_distance_limit + MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, dist_r_l, dist_r_r})-2 <= tree_state.read_distance_limit && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.read_distance_limit) || (tree_state.fragment_distance_limit != 0 && - MinimumDistanceIndex::minPos({dist_l_l, dist_l_r, - dist_r_l, dist_r_r})-2 <= tree_state.fragment_distance_limit + MinimumDistanceIndex::minPos({dist_l_l, dist_l_r,dist_r_l, dist_r_r})-2 <= tree_state.fragment_distance_limit && min_not_minus_one(curr_child_clusters.fragment_best_left, curr_child_clusters.fragment_best_right)-2 <= tree_state.fragment_distance_limit) )) { @@ -1405,8 +1383,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank pair dists_c = old_dists[child_cluster_head]; - if (dist_l_l != -1 && dists_c.first != -1 - && other_node_clusters.fragment_best_left != -1 ) { + if (dist_l_l != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_left != -1 ) { //If cluster child_cluster_head can be combined with clusters in j //from the left of both of them int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : @@ -1416,8 +1393,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank fragment_dist, read_dist, read_num); } - if (dist_l_r != -1 && dists_c.first != -1 - && other_node_clusters.fragment_best_right != -1 ) { + if (dist_l_r != -1 && dists_c.first != -1 && other_node_clusters.fragment_best_right != -1 ) { //If it can be combined from the left to the right of j int64_t fragment_dist = dist_l_r + dists_c.first + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? 
-1 : @@ -1425,16 +1401,14 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(c_group, group_l_r[read_num], fragment_group_l_r, fragment_dist, read_dist, read_num); } - if (dist_r_l != -1 && dists_c.second != -1 - && other_node_clusters.fragment_best_left != -1 ) { + if (dist_r_l != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_left != -1 ) { int64_t fragment_dist = dist_r_l + dists_c.second + other_node_clusters.fragment_best_left-1; int64_t read_dist = other_node_clusters.read_best_left[read_num] == -1 ? -1 : dist_r_l + dists_c.second + other_node_clusters.read_best_left[read_num]-1; combine_clusters(c_group, group_r_l[read_num], fragment_group_r_l, fragment_dist, read_dist, read_num); } - if (dist_r_r != -1 && dists_c.second != -1 - && other_node_clusters.fragment_best_right != -1 ) { + if (dist_r_r != -1 && dists_c.second != -1 && other_node_clusters.fragment_best_right != -1 ) { int64_t fragment_dist = dist_r_r + dists_c.second + other_node_clusters.fragment_best_right-1; int64_t read_dist = other_node_clusters.read_best_right[read_num] == -1 ? -1 : dist_r_r + dists_c.second + other_node_clusters.read_best_right[read_num]-1; @@ -1443,17 +1417,16 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank } } - //Go through children of j + + //Go through clusters of child node j vector> children_j( make_move_iterator(other_node_clusters.read_cluster_heads.begin()), make_move_iterator(other_node_clusters.read_cluster_heads.end())); for (size_t k_i = 0 ; k_i < children_j.size() ; k_i++){ - //For each cluster of child j, find which overlaps with - //clusters of i - //child_cluster_head will already be part of a cluster in - //snarlcluster heads but since we need to know the node - //that the snarl is on we can't just loop through + //For each cluster of child j, find which overlaps with clusters of i + //child_cluster_head will already be part of a cluster in snarl_cluster_heads but + //since we need to know the node that the snarl is on we can't just loop through //snarl_cluster heads pair child_cluster_head = children_j[k_i]; size_t read_num = child_cluster_head.first; @@ -1461,8 +1434,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank size_t k_group = tree_state.read_union_find[read_num].find_group(child_cluster_head.second); - if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 - && dists_k.first != -1 ){ + if (dist_l_l != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.first != -1 ){ int64_t fragment_dist = dist_l_l + curr_child_clusters.fragment_best_left + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? -1 : @@ -1470,8 +1442,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(k_group, group_l_l[read_num], fragment_group_l_l, fragment_dist,read_dist, read_num); } - if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 - && dists_k.second != -1 ) { + if (dist_l_r != -1 && curr_child_clusters.fragment_best_left != -1 && dists_k.second != -1 ) { int64_t read_dist = curr_child_clusters.read_best_left[read_num] == -1 ? 
-1 : dist_l_r + curr_child_clusters.read_best_left[read_num] + dists_k.second-1; @@ -1479,8 +1450,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(k_group, group_l_r[read_num], fragment_group_l_r, fragment_dist, read_dist, read_num); } - if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 - && dists_k.first != -1 ) { + if (dist_r_l != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.first != -1 ) { int64_t fragment_dist = dist_r_l + curr_child_clusters.fragment_best_right + dists_k.first-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : @@ -1488,8 +1458,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank combine_clusters(k_group, group_r_l[read_num], fragment_group_r_l, fragment_dist, read_dist, read_num); } - if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 - && dists_k.second != -1 ) { + if (dist_r_r != -1 && curr_child_clusters.fragment_best_right != -1 && dists_k.second != -1 ) { int64_t fragment_dist = dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 ? -1 : diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index e35f877d4a7..fd07b696368 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -14,7 +14,7 @@ class SnarlSeedClusterer { SnarlSeedClusterer(MinimumDistanceIndex& dist_index); - //Represents all clusters for one vector of seeds + //Represents all clusters for one vector of seeds (corresponding to a read) //Each cluster is a vector of indexes into the vector of seeds typedef vector> cluster_group_t; @@ -22,7 +22,7 @@ class SnarlSeedClusterer { //cluster the seeds such that two seeds whose minimum distance //between them (including both of the positions) is less than // the distance limit are in the same cluster - cluster_group_t cluster_seeds ( vector seeds, int64_t read_distance_limit) const; + cluster_group_t cluster_seeds ( const vector& seeds, int64_t read_distance_limit) const; ///The same thing, but for paired end reads. //Given seeds from multiple reads of a fragment, cluster each read @@ -33,7 +33,7 @@ class SnarlSeedClusterer { //The fragment clusters give seeds the index they would get if the vectors of // seeds were appended to each other in the order given tuple, cluster_group_t> cluster_seeds ( - vector>& all_seeds, + const vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit=0) const; private: @@ -57,7 +57,9 @@ class SnarlSeedClusterer { } struct NetgraphNode { - //child nodes of a snarl's netgraph + //Represents a child node of a snarl's netgraph + + //node_id is the node id if the node is just a node, index into //dist_index's snarl_indexes/chain_index if it is a snarl/chain size_t node_id; @@ -87,9 +89,8 @@ class SnarlSeedClusterer { //Get the forward rank of this node in the parent's netgraph //to look up distances - size_t rank = node_type == NODE ? - dist_index.getPrimaryRank(id) : - dist_index.getSecondaryRank(id); + size_t rank = node_type == NODE ? 
dist_index.getPrimaryRank(id) : + dist_index.getSecondaryRank(id); if ( (node_type == SNARL && dist_index.snarl_indexes[dist_index.getPrimaryAssignment(id)].rev_in_parent) || (node_type == CHAIN && @@ -101,7 +102,7 @@ class SnarlSeedClusterer { }; struct NodeClusters { - //Clusters in the context of a snarl tree node + //All clusters of a snarl tree node //The node containing this struct may be an actual node, // snarl/chain that is a node the parent snarl's netgraph, // or a snarl in a chain @@ -134,7 +135,7 @@ class SnarlSeedClusterer { //is updated to know about its children //Vector of all the seeds for each read - vector>* all_seeds; + const vector>* all_seeds; //prefix sum vector of the number of seeds per read //To get the index of a seed for the fragment clusters @@ -171,8 +172,7 @@ class SnarlSeedClusterer { //Map from snarl (index into dist_index.snarl_indexes) i //to the netgraph nodes contained in the snarl as well as the //clusters at the node - hash_map>> - snarl_to_nodes; + hash_map>> snarl_to_nodes; //Map each chain to the snarls (only ones that contain seeds) that //comprise it. @@ -181,19 +181,17 @@ class SnarlSeedClusterer { //Map maps the rank of the snarl to the snarl and snarl's clusters // Since maps are ordered, it will be in the order of traversal // of the snarls in the chain - hash_map>> - chain_to_snarls; + hash_map>> chain_to_snarls; //Same structure as snarl_to_nodes but for the level of the snarl //tree above the current one //This gets updated as the current level is processed - hash_map>> - parent_snarl_to_nodes; + hash_map>> parent_snarl_to_nodes; //Constructor takes in a pointer to the seeds, the distance limits, and //the total number of seeds in all_seeds - TreeState (vector>* all_seeds, int64_t read_distance_limit, + TreeState (const vector>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), read_distance_limit(read_distance_limit), @@ -201,11 +199,13 @@ class SnarlSeedClusterer { fragment_union_find (seed_count, false), read_index_offsets(1,0){ - for (vector& v : *all_seeds) { + for (size_t i = 0 ; i < all_seeds->size() ; i++) { + const vector& v = all_seeds->at(i); size_t offset = read_index_offsets.back() + v.size(); read_index_offsets.push_back(offset); read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1)); node_to_seeds.emplace_back(); + node_to_seeds.back().reserve(v.size()); read_union_find.emplace_back(v.size(), false); } } diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index e6fec99233c..8c4332d9d10 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -823,6 +823,49 @@ namespace unittest { } }//end test case + + /* + TEST_CASE("Load graph", "[cluster]"){ + + ifstream vg_stream("testGraph"); + VG vg(vg_stream); + vg_stream.close(); + CactusSnarlFinder bubble_finder(vg); + SnarlManager snarl_manager = bubble_finder.find_snarls(); + + MinimumDistanceIndex dist_index (&vg, &snarl_manager); + SnarlSeedClusterer clusterer(dist_index); + + int64_t read_lim = 20;// Distance between read clusters + int64_t fragment_lim = 30;// Distance between fragment clusters + + vector> all_seeds; + all_seeds.emplace_back(); + all_seeds.emplace_back(); + + + all_seeds[0].push_back(make_pos_t(206, true, 9)); + all_seeds[0].push_back(make_pos_t(277, false, 1)); + all_seeds[0].push_back(make_pos_t(263, true, 11)); + all_seeds[0].push_back(make_pos_t(280, false, 10)); + all_seeds[0].push_back(make_pos_t(279, true, 3)); + 
all_seeds[0].push_back(make_pos_t(282, false, 0)); + all_seeds[0].push_back(make_pos_t(300, false, 0)); + all_seeds[0].push_back(make_pos_t(248, false, 0)); + all_seeds[0].push_back(make_pos_t(245, false, 0)); + all_seeds[0].push_back(make_pos_t(248, true, 0)); + + tuple>>, vector>> paired_clusters = + clusterer.cluster_seeds(all_seeds, read_lim, fragment_lim); + vector>> read_clusters = std::get<0>(paired_clusters); + vector> fragment_clusters = std::get<1>(paired_clusters); + cerr << "read cluster: " << read_clusters[0].size() << endl << "fragment clusters: " << fragment_clusters.size() << endl; + + REQUIRE(fragment_clusters.size() == 2); + REQUIRE((fragment_clusters[0].size() == 4 || + fragment_clusters[1].size() == 4)); + }//end test case + */ TEST_CASE("Random graphs", "[cluster]"){ for (int i = 0; i < 1000; i++) { @@ -846,13 +889,14 @@ namespace unittest { uniform_int_distribution randSnarlIndex(0, allSnarls.size()-1); default_random_engine generator(time(NULL)); for (size_t k = 0; k < 1000 ; k++) { + vector> all_seeds; all_seeds.emplace_back(); all_seeds.emplace_back(); - int64_t read_lim = 20;// Distance between read clusters + int64_t read_lim = 15;// Distance between read clusters int64_t fragment_lim = 30;// Distance between fragment clusters for (size_t read = 0 ; read < 2 ; read ++) { - for (int j = 0; j < 20; j++) { + for (int j = 0; j < 200; j++) { //Check clusters of j random positions const Snarl* snarl1 = allSnarls[randSnarlIndex(generator)]; @@ -899,9 +943,7 @@ namespace unittest { for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { pos_t pos1 = all_seeds[read_num][clust[i1]]; size_t len1 = graph.get_length(graph.get_handle(get_id(pos1), false)); - pos_t rev1 = make_pos_t(get_id(pos1), - !is_rev(pos1), - len1 - get_offset(pos1)-1); + pos_t rev1 = make_pos_t(get_id(pos1), !is_rev(pos1),len1 - get_offset(pos1)-1); for (size_t b = 0 ; b < one_read_clusters.size() ; b++) { if (b != a) { From fc2cf1664beae36058a46e512f1c51551f72dd4e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 12 Nov 2019 21:44:28 -0500 Subject: [PATCH 46/79] activate pruning in poisson caller. 
also use iterative pruning if first try doesn't meet cutoff --- src/snarl_caller.cpp | 20 ++++++++++---------- src/snarl_caller.hpp | 17 ++++++++--------- src/traversal_finder.cpp | 6 +++--- src/traversal_finder.hpp | 10 ++++++++-- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index bb55e987755..0d307fc19a2 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -8,9 +8,9 @@ namespace vg { SnarlCaller::~SnarlCaller() { } -function<bool(const SnarlTraversal&)> SnarlCaller::get_skip_allele_fn() const { +function<bool(const SnarlTraversal&, int)> SnarlCaller::get_skip_allele_fn() const { // default implementation says don't skip anything - return [](const SnarlTraversal&) { return false; }; + return [](const SnarlTraversal&, int) { assert(false); return false; }; } SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, @@ -65,6 +65,14 @@ int SupportBasedSnarlCaller::get_best_support(const vector<Support>& supports, c return best_allele; } +function<bool(const SnarlTraversal&, int)> SupportBasedSnarlCaller::get_skip_allele_fn() const { + // port over cutoff used in old support caller (there avg support used all the time, here + // we use the same toggles as when genotyping) + return [&](const SnarlTraversal& trav, int iteration) -> bool { + return support_val(support_finder.get_traversal_support(trav)) < pow(2, iteration) * min_alt_path_support; + }; +} + RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, TraversalSupportFinder& support_finder) : SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { @@ -404,14 +412,6 @@ void RatioSupportSnarlCaller::update_vcf_header(string& header) const { std::to_string(min_site_depth) + "\">\n"; } -function<bool(const SnarlTraversal&)> RatioSupportSnarlCaller::get_skip_allele_fn() const { - // port over cutoff used in old support caller (there avg support used all the time, here - // we use the same toggles as when genotyping) - return [&](const SnarlTraversal& trav) -> bool { - return support_val(support_finder.get_traversal_support(trav)) < min_alt_path_support; - }; -} - double RatioSupportSnarlCaller::get_bias(const vector<int>& traversal_sizes, int best_trav, int second_best_trav, int ref_trav_idx) const { bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 62f18563f56..d88c2bcd4d5 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -51,7 +51,7 @@ class SnarlCaller { virtual void update_vcf_header(string& header) const = 0; /// Optional method used for pruning searches - virtual function<bool(const SnarlTraversal&)> get_skip_allele_fn() const; + virtual function<bool(const SnarlTraversal&, int)> get_skip_allele_fn() const; }; /** @@ -82,6 +82,9 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// Get the minimum total support for call virtual int get_min_total_support_for_call() const; + /// Use min_alt_path_support threshold as cutoff + virtual function<bool(const SnarlTraversal&, int)> get_skip_allele_fn() const; + protected: /// Get the best support out of a list of supports, ignoring skips @@ -107,6 +110,9 @@ class SupportBasedSnarlCaller : public SnarlCaller { /// what's the minimum total support (over all alleles) of the site to make /// a call size_t min_site_depth = 3; + /// used only for pruning alleles in the VCFTraversalFinder: minimum support + /// of an allele's alt-path for it to be considered in the brute-force enumeration + double min_alt_path_support = 0.2; }; @@ -141,9 +147,6 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { /// Define
any header fields needed by the above virtual void update_vcf_header(string& header) const; - /// Use min_alt_path_support threshold as cutoff - virtual function get_skip_allele_fn() const; - protected: /// Get the bias used to for comparing two traversals @@ -171,11 +174,7 @@ class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { /// the reference, the call is made. set to 0 to deactivate. double max_ma_bias = 0; /// what's the min log likelihood for allele depth assignments to PASS? - double min_ad_log_likelihood_for_filter = -9; - /// used only for pruning alleles in the VCFTraversalFinder: minimum support - /// of an allele's alt-path for it to be considered in the brute-force enumeration - double min_alt_path_support = 0.2; - + double min_ad_log_likelihood_for_filter = -9; }; /** diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index 540bcdf54f7..74f40700101 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -2546,7 +2546,7 @@ VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManage const vector& ref_path_names, FastaReference* ref_fasta, FastaReference* ins_fasta, - function skip_alt, + function skip_alt, size_t max_traversal_cutoff) : graph(graph), snarl_manager(snarl_manager), @@ -3248,7 +3248,7 @@ vector> VCFTraversalFinder::get_pruned_alt_alleles( } // only invoke pruning if we exceed our cutoff. fairly rare on most graphs - if (!check_max_trav_cutoff(alt_alleles)) { + for (int prune_it = 0; prune_it < max_prune_iterations && !check_max_trav_cutoff(alt_alleles); ++prune_it) { for (auto& alleles : alt_alleles) { alleles.clear(); } @@ -3256,7 +3256,7 @@ vector> VCFTraversalFinder::get_pruned_alt_alleles( for (int var_i = 0; var_i < site_variants.size(); ++var_i) { for (int allele = 0; allele < site_variants[var_i]->alleles.size(); ++allele) { if (skip_alt == nullptr || - skip_alt(get_alt_path(site_variants[var_i], allele, ref_path).first) == false) { + skip_alt(get_alt_path(site_variants[var_i], allele, ref_path).first, prune_it) == false) { alt_alleles[var_i].push_back(allele); } #ifdef debug diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 4257ce58e13..573c17155c9 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -416,13 +416,19 @@ class VCFTraversalFinder : public TraversalFinder { /// Use this method to prune the search space by selecting alt-alleles /// to skip by considering their paths (in SnarlTraversal) format - function skip_alt; + /// It will try again and again until enough traversals are pruned, + /// with iteration keeping track of how many tries (so it should become stricter + /// as iteration increases) + function skip_alt; /// If a snarl has more than this many traversals, return nothing and print /// a warning. Dense and large deletions will make this happen from time /// to time. In practice, skip_alt (above) can be used to prune down /// the search space by selecting alleles to ignore. 
size_t max_traversal_cutoff; + + /// Maximum number of pruning iterations + size_t max_prune_iterations = 1000; /// Include snarl endpoints in traversals bool include_endpoints = true; @@ -446,7 +452,7 @@ class VCFTraversalFinder : public TraversalFinder { const vector& ref_path_names = {}, FastaReference* fasta_ref = nullptr, FastaReference* ins_ref = nullptr, - function skip_alt = nullptr, + function skip_alt = nullptr, size_t max_traversal_cutoff = 500000); virtual ~VCFTraversalFinder(); From af07aad51963597186f6520993b3edc820022154 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Wed, 13 Nov 2019 11:01:33 -0800 Subject: [PATCH 47/79] Fixed comments, weird indentations, etc --- scripts/giraffe-wrangler.sh | 2 +- src/seed_clusterer.cpp | 126 +++++++++++++++----------------- src/seed_clusterer.hpp | 19 +++-- src/subcommand/gaffe_main.cpp | 4 +- src/unittest/seed_clusterer.cpp | 2 +- 5 files changed, 76 insertions(+), 77 deletions(-) diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh index 5dfaedb3083..20f752148d9 100755 --- a/scripts/giraffe-wrangler.sh +++ b/scripts/giraffe-wrangler.sh @@ -91,7 +91,7 @@ echo "${SIM_GAM}" echo "${REAL_FASTQ}" # Define the Giraffe parameters -GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 150 -a 4 -s 50 -u 0.4 -v 1 -w 20) +GIRAFFE_OPTS=(-C 1500 -F 0.8 -e 300 -a 6 -s 50 -u 0.4 -v 1 -w 20) # Define a work directory # TODO: this requires GNU mptemp diff --git a/src/seed_clusterer.cpp b/src/seed_clusterer.cpp index d4597a0b31d..16a6a9ae7a2 100644 --- a/src/seed_clusterer.cpp +++ b/src/seed_clusterer.cpp @@ -11,9 +11,10 @@ namespace vg { }; SnarlSeedClusterer::cluster_group_t SnarlSeedClusterer::cluster_seeds (const vector& seeds, int64_t read_distance_limit) const { + //Wrapper for single ended - vector> all_seeds; - all_seeds.push_back(seeds); + vector*> all_seeds; + all_seeds.push_back(&seeds); tuple,SnarlSeedClusterer::cluster_group_t> all_clusters = cluster_seeds(all_seeds, read_distance_limit, 0); @@ -24,6 +25,16 @@ namespace vg { tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( const vector>& all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit) const { + //Wrapper for paired end + vector*> seed_pointers; + seed_pointers.reserve(all_seeds.size()); + for (const vector& v : all_seeds) seed_pointers.push_back(&v); + return cluster_seeds(seed_pointers, read_distance_limit, fragment_distance_limit); + } + + tuple,SnarlSeedClusterer::cluster_group_t> SnarlSeedClusterer::cluster_seeds ( + const vector*>& all_seeds, int64_t read_distance_limit, + int64_t fragment_distance_limit) const { /* Given a vector of seeds and a limit, find a clustering of seeds where * seeds that are closer than the limit cluster together. 
* Returns a vector of cluster assignments @@ -47,7 +58,7 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; //for a single level of the snarl tree as it is being processed //It also keeps track of the parents of the current level size_t seed_count = 0; - for (auto& v : all_seeds) seed_count+= v.size(); + for (auto v : all_seeds) seed_count+= v->size(); TreeState tree_state (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); @@ -74,7 +85,7 @@ cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; #ifdef DEBUG_CLUSTER assert(tree_state.read_index_offsets[0] == 0); for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { - assert (tree_state.read_index_offsets[i] + tree_state.all_seeds->at(i).size() == tree_state.read_index_offsets[i+1]); + assert (tree_state.read_index_offsets[i] + tree_state.all_seeds->at(i)->size() == tree_state.read_index_offsets[i+1]); } #endif //Cluster all the snarls at this depth @@ -97,7 +108,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { for (auto group : tree_state.read_union_find[read_num].all_groups()){ cerr << "\t\t"; for (size_t c : group) { - cerr << tree_state.all_seeds->at(read_num)[c] << " "; + cerr << tree_state.all_seeds->at(read_num)->at(c) << " "; } cerr << endl; } @@ -139,9 +150,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { // Assign each seed to a node. hash_set seen_nodes; for (size_t read_num = 0 ; read_num < tree_state.all_seeds->size() ; read_num++){ - const vector& seeds = tree_state.all_seeds->at(read_num); - for (size_t i = 0; i < seeds.size(); i++) { - pos_t pos = seeds.at(i); + const vector* seeds = tree_state.all_seeds->at(read_num); + for (size_t i = 0; i < seeds->size(); i++) { + pos_t pos = seeds->at(i); id_t id = get_id(pos); //Assign the seed to a node @@ -310,7 +321,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //And find the shortest distance from any seed to both //ends of the node - pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + pos_t seed = tree_state.all_seeds->at(read_num)->at(iter->second); int64_t dist_left = is_rev(seed) ? node_length- get_offset(seed) : get_offset(seed) + 1; int64_t dist_right = is_rev(seed) ? get_offset(seed) + 1 : node_length - get_offset(seed); @@ -360,9 +371,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -402,7 +413,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { for (auto iter = seed_range_start; iter != tree_state.node_to_seeds[read_num].end() && iter->first == node_id; ++iter) { //For each seed, find its offset - pos_t seed = tree_state.all_seeds->at(read_num)[iter->second]; + pos_t seed = tree_state.all_seeds->at(read_num)->at(iter->second); int64_t offset = is_rev(seed) ? 
node_length - get_offset(seed) : get_offset(seed) + 1; node_clusters.fragment_best_left = min_not_minus_one(offset, node_clusters.fragment_best_left); @@ -498,9 +509,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -620,8 +631,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //Union the two groups tree_state.read_union_find[read_num].union_groups(combined_group, new_group); //Find the new distances of the combined groups - pair& old_dists = - tree_state.read_cluster_dists[read_num][combined_group]; + pair& old_dists = tree_state.read_cluster_dists[read_num][combined_group]; size_t new_combined_group = tree_state.read_union_find[read_num].find_group(new_group); //Update which groups are being kept track of if (new_combined_group != new_group) { @@ -657,8 +667,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //If these aren't in the same read cluster but are in //the same fragment cluster if (fragment_combined_group != -1) { - tree_state.fragment_union_find.union_groups(fragment_combined_group, - new_group + tree_state.read_index_offsets[read_num]); + tree_state.fragment_union_find.union_groups( + fragment_combined_group, new_group + tree_state.read_index_offsets[read_num]); } fragment_combined_group = tree_state.fragment_union_find.find_group( new_group + tree_state.read_index_offsets[read_num]); @@ -694,8 +704,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //The clusters of the current snarl NodeClusters& snarl_clusters = kv.second.second; - MinimumDistanceIndex::SnarlIndex& snarl_index = - dist_index.snarl_indexes[curr_snarl_i]; + MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[curr_snarl_i]; //Get the lengths of the start and end nodes of the snarl, relative //to the order of the chain @@ -738,10 +747,7 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { //Distance from the start of chain to the start of the current snarl - int64_t add_dist_left = start_rank == 0 ? 0 : - chain_index.prefix_sum[start_rank] - 1; - - + int64_t add_dist_left = start_rank == 0 ? 
0 : chain_index.prefix_sum[start_rank] - 1; //Combine snarl clusters that can be reached by looping int64_t loop_dist_end = chain_index.loop_fd[start_rank + 1] - 1 ; @@ -759,9 +765,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { << endl; cerr << "\t\t"; bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -779,9 +785,9 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { cerr << "\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; } } cerr << endl; @@ -833,10 +839,8 @@ for (size_t i = 1 ; i < tree_state.all_seeds->size() ; i++) { ? -1 : snarl_dists.first + loop_dist_start + snarl_length - start_length; snarl_dists.second = min_not_minus_one(new_right, snarl_dists.second); - snarl_clusters.fragment_best_right = - min_not_minus_one(snarl_clusters.fragment_best_right, new_right); - snarl_clusters.read_best_right[read_num] = - min_not_minus_one(snarl_clusters.read_best_right[read_num], new_right); + snarl_clusters.fragment_best_right = min_not_minus_one(snarl_clusters.fragment_best_right, new_right); + snarl_clusters.read_best_right[read_num] = min_not_minus_one(snarl_clusters.read_best_right[read_num], new_right); #ifdef DEBUG_CLUSTER cerr << " (Possibly) updating looping distance to right of snarl cluster " << read_num <<":" << cluster_head.second << ": " << new_right << " -> " << snarl_dists.second << endl; @@ -868,8 +872,7 @@ cerr << " Combining this cluster from the left " ; if (snarl_dists.first == -1 || (new_left != -1 && new_left < snarl_dists.first)){ //If this is an improvement, update distances snarl_dists.first = new_left; - snarl_clusters.read_best_left[read_num] = - min_not_minus_one(new_left, snarl_clusters.read_best_left[read_num]); + snarl_clusters.read_best_left[read_num] = min_not_minus_one(new_left, snarl_clusters.read_best_left[read_num]); snarl_clusters.fragment_best_left = min_not_minus_one(new_left, snarl_clusters.fragment_best_left); #ifdef DEBUG_CLUSTER @@ -891,8 +894,7 @@ cerr << " Maybe combining this cluster from the right" << endl; snarl_clusters.fragment_best_right + snarl_dists.second + loop_dist_end - end_length - 1; combine_snarl_clusters(cluster_head.second, snarl_cluster_right[read_num], - fragment_snarl_cluster_right, to_erase,fragment_dist, - read_dist, snarl_dists, read_num); + fragment_snarl_cluster_right, to_erase,fragment_dist, read_dist, snarl_dists, read_num); } } @@ -900,10 +902,13 @@ cerr << " Maybe combining this cluster from the right" << endl; //existing chain clusters int64_t read_dist = read_chain_right[read_num] == -1 || snarl_dists.first == -1 ? -1 : snarl_dists.first + read_chain_right[read_num] - start_length-1; + int64_t fragment_dist = tree_state.fragment_distance_limit == 0 || fragment_chain_right == -1 || snarl_dists.first == -1 ? 
-1 : snarl_dists.first+fragment_chain_right-start_length-1; + pair new_snarl_dists (snarl_dists.first == -1 ? -1 : snarl_dists.first + add_dist_left, snarl_dists.second); + bool combined_read = combine_chain_clusters (cluster_head.second,combined_cluster, fragment_combined_cluster, combined_left, combined_right, new_snarl_dists, to_erase, fragment_dist, read_dist, read_num); @@ -915,10 +920,8 @@ cerr << " Maybe combining this cluster from the right" << endl; snarl_dists.second); chain_clusters.fragment_best_left = min_not_minus_one(chain_clusters.fragment_best_left,d.first); chain_clusters.fragment_best_right = min_not_minus_one(chain_clusters.fragment_best_right,d.second); - chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], - d.first); - chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], - d.second); + chain_clusters.read_best_left[read_num] = min_not_minus_one(chain_clusters.read_best_left[read_num], d.first); + chain_clusters.read_best_right[read_num] = min_not_minus_one(chain_clusters.read_best_right[read_num], d.second); tree_state.read_cluster_dists[read_num][cluster_head.second] = std::move(d); } @@ -997,9 +1000,9 @@ cerr << " Maybe combining this cluster from the right" << endl; cerr << "\t\tleft: " << dists.first << " right : " << dists.second << endl; cerr << "\t\t\t"; bool has_seeds = false; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -1046,11 +1049,9 @@ cerr << " Maybe combining this cluster from the right" << endl; pair& chain_dists = tree_state.read_cluster_dists[read_num][cluster_head.second]; if ((chain_dists.second != -1 && chain_clusters.read_best_left[read_num] != -1 && - chain_dists.second + chain_clusters.read_best_left[read_num] - first_length - 1 - <= tree_state.read_distance_limit) || + chain_dists.second + chain_clusters.read_best_left[read_num] - first_length - 1 <= tree_state.read_distance_limit) || (chain_dists.first != -1 && chain_clusters.read_best_right[read_num] != -1 && - chain_dists.first + chain_clusters.read_best_right[read_num] - first_length - 1 - <= tree_state.read_distance_limit)){ + chain_dists.first + chain_clusters.read_best_right[read_num] - first_length - 1 <= tree_state.read_distance_limit)){ //If this chain cluster is in the combined cluster if (combined_cluster[read_num] == -1) { combined_cluster[read_num] = cluster_head.second; @@ -1076,11 +1077,9 @@ cerr << " Maybe combining this cluster from the right" << endl; } } else if (tree_state.fragment_distance_limit != 0 && ((chain_dists.second != -1 && chain_clusters.fragment_best_left != -1 && - chain_dists.second + chain_clusters.fragment_best_left - first_length - 1 - <= tree_state.fragment_distance_limit) || + chain_dists.second + chain_clusters.fragment_best_left - first_length - 1 <= tree_state.fragment_distance_limit) || (chain_dists.first != -1 && chain_clusters.fragment_best_right != -1 && - chain_dists.first + chain_clusters.fragment_best_right - first_length - 1 - <= tree_state.fragment_distance_limit))){ + chain_dists.first + chain_clusters.fragment_best_right - first_length - 1 <= tree_state.fragment_distance_limit))){ //If we can cluster by fragment if 
(fragment_combined_cluster != -1) { tree_state.fragment_union_find.union_groups(fragment_combined_cluster, cluster_head.second+tree_state.read_index_offsets[read_num]); @@ -1116,9 +1115,9 @@ cerr << " Maybe combining this cluster from the right" << endl; pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } @@ -1159,8 +1158,7 @@ cerr << " Maybe combining this cluster from the right" << endl; TreeState& tree_state, size_t snarl_index_i) const { /*Get the clusters on this snarl. * Nodes have not yet been clustered */ - MinimumDistanceIndex::SnarlIndex& snarl_index = - dist_index.snarl_indexes[snarl_index_i]; + MinimumDistanceIndex::SnarlIndex& snarl_index = dist_index.snarl_indexes[snarl_index_i]; #ifdef DEBUG_CLUSTER cerr << "Finding clusters on snarl number " << snarl_index_i << " headed by node " << snarl_index.id_in_parent << endl; @@ -1169,8 +1167,7 @@ cerr << " Maybe combining this cluster from the right" << endl; //Keep track of all clusters on this snarl NodeClusters snarl_clusters(tree_state.all_seeds->size()); - auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, - size_t& fragment_combined_group, + auto combine_clusters = [&] (size_t& new_group, size_t& combined_group, size_t& fragment_combined_group, int64_t fragment_dist, int64_t read_dist, size_t read_num){ //Helper function to compare and combine clusters in two nodes of the same snarl //If the distance between two clusters is small enough, then combine them @@ -1273,9 +1270,9 @@ cerr << " Maybe combining this cluster from the right" << endl; cerr << "\tdist left: " << tree_state.read_cluster_dists[c.first][c.second].first << " dist right: " << tree_state.read_cluster_dists[c.first][c.second].second << endl; cerr << "\t\t"; - for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; } } cerr << endl; @@ -1304,10 +1301,8 @@ cerr << "\tcluster: " << c_i << "dists to ends in snarl" << snarl_index.id_in_pa << " : " << new_dists.first << " " << new_dists.second << endl; #endif - snarl_clusters.fragment_best_left =min_not_minus_one( - snarl_clusters.fragment_best_left,new_dists.first); - snarl_clusters.fragment_best_right = min_not_minus_one( - snarl_clusters.fragment_best_right, new_dists.second); + snarl_clusters.fragment_best_left =min_not_minus_one( snarl_clusters.fragment_best_left,new_dists.first); + snarl_clusters.fragment_best_right = min_not_minus_one(snarl_clusters.fragment_best_right, new_dists.second); snarl_clusters.read_best_left[child_cluster_head.first] =min_not_minus_one( snarl_clusters.read_best_left[child_cluster_head.first], new_dists.first); snarl_clusters.read_best_right[child_cluster_head.first] = min_not_minus_one( @@ -1463,8 +1458,7 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank int64_t fragment_dist = dist_r_r + curr_child_clusters.fragment_best_right + dists_k.second-1; int64_t read_dist = curr_child_clusters.read_best_right[read_num] == -1 
? -1 : dist_r_r + curr_child_clusters.read_best_right[read_num] + dists_k.second-1; - combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, - fragment_dist, read_dist, read_num); + combine_clusters(k_group, group_r_r[read_num], fragment_group_r_r, fragment_dist, read_dist, read_num); } } } @@ -1488,9 +1482,9 @@ cerr << "\t distances between ranks " << node_rank << " and " << other_rank pair dists = tree_state.read_cluster_dists[c.first][c.second]; cerr << "\t" << c.first << ":"<at(c.first).size() ; x++) { + for (size_t x = 0 ; x < tree_state.all_seeds->at(c.first)->size() ; x++) { if (tree_state.read_union_find[c.first].find_group(x) == c.second) { - cerr << tree_state.all_seeds->at(c.first)[x] << " "; + cerr << tree_state.all_seeds->at(c.first)->at(x) << " "; has_seeds = true; } } diff --git a/src/seed_clusterer.hpp b/src/seed_clusterer.hpp index fd07b696368..6971080a965 100644 --- a/src/seed_clusterer.hpp +++ b/src/seed_clusterer.hpp @@ -38,6 +38,11 @@ class SnarlSeedClusterer { private: + //Actual clustering function that takes a vector of pointers to seeds + tuple, cluster_group_t> cluster_seeds ( + const vector*>& all_seeds, + int64_t read_distance_limit, int64_t fragment_distance_limit=0) const; + MinimumDistanceIndex& dist_index; enum ChildNodeType {CHAIN, SNARL, NODE}; @@ -135,7 +140,7 @@ class SnarlSeedClusterer { //is updated to know about its children //Vector of all the seeds for each read - const vector>* all_seeds; + const vector*>* all_seeds; //prefix sum vector of the number of seeds per read //To get the index of a seed for the fragment clusters @@ -191,7 +196,7 @@ class SnarlSeedClusterer { //Constructor takes in a pointer to the seeds, the distance limits, and //the total number of seeds in all_seeds - TreeState (const vector>* all_seeds, int64_t read_distance_limit, + TreeState (const vector*>* all_seeds, int64_t read_distance_limit, int64_t fragment_distance_limit, size_t seed_count) : all_seeds(all_seeds), read_distance_limit(read_distance_limit), @@ -200,13 +205,13 @@ class SnarlSeedClusterer { read_index_offsets(1,0){ for (size_t i = 0 ; i < all_seeds->size() ; i++) { - const vector& v = all_seeds->at(i); - size_t offset = read_index_offsets.back() + v.size(); + size_t size = all_seeds->at(i)->size(); + size_t offset = read_index_offsets.back() + size; read_index_offsets.push_back(offset); - read_cluster_dists.emplace_back(v.size(), make_pair(-1,-1)); + read_cluster_dists.emplace_back(size, make_pair(-1,-1)); node_to_seeds.emplace_back(); - node_to_seeds.back().reserve(v.size()); - read_union_find.emplace_back(v.size(), false); + node_to_seeds.back().reserve(size); + read_union_find.emplace_back(size, false); } } }; diff --git a/src/subcommand/gaffe_main.cpp b/src/subcommand/gaffe_main.cpp index fd882eaedba..3de30c6b183 100644 --- a/src/subcommand/gaffe_main.cpp +++ b/src/subcommand/gaffe_main.cpp @@ -358,9 +358,9 @@ int main_gaffe(int argc, char** argv) { // How many mappings per read can we emit? Range max_multimaps = 1; // How many clusters should we extend? - Range max_extensions = 150; + Range max_extensions = 300; // How many extended clusters should we align, max? 
- Range max_alignments = 4; + Range max_alignments = 6; //Throw away cluster with scores that are this amount below the best Range cluster_score = 50; //Throw away clusters with coverage this amount below the best diff --git a/src/unittest/seed_clusterer.cpp b/src/unittest/seed_clusterer.cpp index 8c4332d9d10..38e031dd75e 100644 --- a/src/unittest/seed_clusterer.cpp +++ b/src/unittest/seed_clusterer.cpp @@ -868,7 +868,7 @@ namespace unittest { */ TEST_CASE("Random graphs", "[cluster]"){ - for (int i = 0; i < 1000; i++) { + for (int i = 0; i < 0; i++) { // For each random graph VG graph; random_graph(1000, 20, 100, &graph); From efb8eed3a8fca8c1078ea750fd206dd3d66f8a55 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 13 Nov 2019 15:53:46 -0500 Subject: [PATCH 48/79] ratchet down vcf allele search space --- src/snarl_caller.cpp | 6 ++++++ src/snarl_caller.hpp | 2 +- src/traversal_finder.hpp | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index 0d307fc19a2..c7567a90439 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -496,6 +496,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl, if (skips.count(best_allele)) { continue; } + if (support_val(supports[best_allele]) < min_total_support_for_call) { + break; + } if (ploidy == 1) { candidates.insert({best_allele}); @@ -524,6 +527,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl, size_t sec_count = 0; for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { int second_best_allele = ranked_secondary_traversals[j]; + if (support_val(secondary_supports[second_best_allele]) < min_total_support_for_call) { + break; + } if (!skips.count(second_best_allele) && second_best_allele != best_allele) { // canonical ordering for our set candidates.insert({min(best_allele, second_best_allele), max(best_allele, second_best_allele)}); diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index d88c2bcd4d5..1be11053721 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -112,7 +112,7 @@ class SupportBasedSnarlCaller : public SnarlCaller { size_t min_site_depth = 3; /// used only for pruning alleles in the VCFTraversalFinder: minimum support /// of an allele's alt-path for it to be considered in the brute-force enumeration - double min_alt_path_support = 0.2; + double min_alt_path_support = 0.5; }; diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 573c17155c9..f85e602c775 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -428,7 +428,7 @@ class VCFTraversalFinder : public TraversalFinder { size_t max_traversal_cutoff; /// Maximum number of pruning iterations - size_t max_prune_iterations = 1000; + size_t max_prune_iterations = 2; /// Include snarl endpoints in traversals bool include_endpoints = true; @@ -453,7 +453,7 @@ class VCFTraversalFinder : public TraversalFinder { FastaReference* fasta_ref = nullptr, FastaReference* ins_ref = nullptr, function<bool(const SnarlTraversal&, int)> skip_alt = nullptr, - size_t max_traversal_cutoff = 500000); + size_t max_traversal_cutoff = 50000); virtual ~VCFTraversalFinder(); From 2bf760d4e1174aed1e9e140bf09e68db5c8318d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 14 Nov 2019 09:14:35 -0500 Subject: [PATCH 49/79] better vcf output buffering --- src/graph_caller.cpp | 15 +++++++++------ src/graph_caller.hpp | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 22622e475e0..141ac63bede
100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -45,6 +45,7 @@ void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { } VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name) { + output_variants.resize(get_thread_count()); } VCFOutputCaller::~VCFOutputCaller() { @@ -74,17 +75,19 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector + vector<vcflib::Variant> all_variants; + for (const auto& buf : output_variants) { + all_variants.reserve(all_variants.size() + buf.size()); + std::move(buf.begin(), buf.end(), std::back_inserter(all_variants)); + } + std::sort(all_variants.begin(), all_variants.end(), [](const vcflib::Variant& v1, const vcflib::Variant& v2) { return v1.sequenceName < v2.sequenceName || (v1.sequenceName == v2.sequenceName && v1.position < v2.position); }); - for (auto v : output_variants) { + for (auto v : all_variants) { v.setVariantCallFile(output_vcf); out_stream << v << endl; } diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 2f4d8c38bbb..5845be1f666 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -74,8 +74,8 @@ class VCFOutputCaller { /// Sample name string sample_name; - /// output buffer (for sorting) - mutable vector<vcflib::Variant> output_variants; + /// output buffers (1/thread) (for sorting) + mutable vector<vector<vcflib::Variant>> output_variants; }; /** From 2bf760d4e1174aed1e9e140bf09e68db5c8318d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 14 Nov 2019 09:14:35 -0500 Subject: [PATCH 50/79] use caching in traversal support finder --- src/subcommand/call_main.cpp | 4 +-- src/traversal_support.cpp | 65 +++++++++++++++++++++++++++++++++++- src/traversal_support.hpp | 28 ++++++++++++++++ 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index baae2060516..c282b1122c1 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -265,8 +265,8 @@ int main_call(int argc, char** argv) { // Load our packed supports (they must have come from vg pack on graph) packer = unique_ptr<Packer>(new Packer(graph)); packer->load_from_file(pack_filename); - // Make a packed traversal support finder - PackedTraversalSupportFinder* packed_support_finder = new PackedTraversalSupportFinder(*packer, *snarl_manager); + // Make a packed traversal support finder (using the cached version is important for the poisson caller) + PackedTraversalSupportFinder* packed_support_finder = new CachedPackedTraversalSupportFinder(*packer, *snarl_manager); support_finder = unique_ptr<TraversalSupportFinder>(packed_support_finder); SupportBasedSnarlCaller* packed_caller = nullptr; diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp index 2bcb478b7c6..b561bdefd56 100644 --- a/src/traversal_support.cpp +++ b/src/traversal_support.cpp @@ -305,7 +305,7 @@ Support PackedTraversalSupportFinder::get_edge_support(const edge_t& edge) const } Support PackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse, - id_t to, bool to_reverse) const { + id_t to, bool to_reverse) const { Edge proto_edge; proto_edge.set_from(from); proto_edge.set_from_start(from_reverse); @@ -345,4 +345,67 @@ Support PackedTraversalSupportFinder::get_avg_node_support(id_t node) const { } +CachedPackedTraversalSupportFinder::CachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size) : + PackedTraversalSupportFinder(packer, snarl_manager) { + size_t num_threads = get_thread_count(); + min_node_support_cache.resize(num_threads);
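+ // (each of the three cache vectors gets one LRU cache per thread; the getters below pick theirs with omp_get_thread_num(), so lookups stay thread-confined and need no locking)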
avg_node_support_cache.resize(num_threads); + edge_support_cache.resize(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + min_node_support_cache[i] = new LRUCache<id_t, Support>(cache_size); + avg_node_support_cache[i] = new LRUCache<id_t, Support>(cache_size); + edge_support_cache[i] = new LRUCache<edge_t, Support>(cache_size); + } +} + +CachedPackedTraversalSupportFinder::~CachedPackedTraversalSupportFinder() { + for (size_t i = 0; i < min_node_support_cache.size(); ++i) { + delete min_node_support_cache[i]; + delete avg_node_support_cache[i]; + delete edge_support_cache[i]; + } +} + +Support CachedPackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse, + id_t to, bool to_reverse) const { + const HandleGraph* graph = packer.get_graph(); + edge_t edge = graph->edge_handle(graph->get_handle(from, from_reverse), + graph->get_handle(to, to_reverse)); + + auto& support_cache = *edge_support_cache[omp_get_thread_num()]; + pair<Support, bool> cached = support_cache.retrieve(edge); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_edge_support(from, from_reverse, to, to_reverse); + support_cache.put(edge, support); + return support; + } +} + +Support CachedPackedTraversalSupportFinder::get_min_node_support(id_t node) const { + auto& support_cache = *min_node_support_cache[omp_get_thread_num()]; + pair<Support, bool> cached = support_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_min_node_support(node); + support_cache.put(node, support); + return support; + } +} + +Support CachedPackedTraversalSupportFinder::get_avg_node_support(id_t node) const { + auto& support_cache = *avg_node_support_cache[omp_get_thread_num()]; + pair<Support, bool> cached = support_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_avg_node_support(node); + support_cache.put(node, support); + return support; + } +} + + } diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp index 42fd10607bb..c26c7899559 100644 --- a/src/traversal_support.hpp +++ b/src/traversal_support.hpp @@ -119,6 +119,34 @@ class PackedTraversalSupportFinder : public TraversalSupportFinder { const Packer& packer; }; +/** + * Add a caching overlay to the PackedTraversalSupportFinder to avoid frequent + * base queries which can become expensive.
Even caching the edges seems + * to have an impact + */ +class CachedPackedTraversalSupportFinder : public PackedTraversalSupportFinder { +public: + CachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size = 100000); + virtual ~CachedPackedTraversalSupportFinder(); + + /// Support of an edge + virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const; + + /// Minimum support of a node + virtual Support get_min_node_support(id_t node) const; + + /// Average support of a node + virtual Support get_avg_node_support(id_t node) const; + +protected: + + /// One cache of each kind per thread + mutable vector<LRUCache<edge_t, Support>*> edge_support_cache; + mutable vector<LRUCache<id_t, Support>*> min_node_support_cache; + mutable vector<LRUCache<id_t, Support>*> avg_node_support_cache; +}; + + } #endif From b7dc1bbe870c058c9d41e56878d37e39e088fd9b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 14 Nov 2019 10:20:32 -0500 Subject: [PATCH 51/79] remove last critical section in caller --- src/graph_caller.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 141ac63bede..f96de89b2fc 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -15,17 +15,16 @@ GraphCaller::~GraphCaller() { void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { // Used to recurse on children of parents that can't be called - vector<const Snarl*> snarl_queue; + size_t thread_count = get_thread_count(); + vector<vector<const Snarl*>> snarl_queue(thread_count); // Run the snarl caller on a snarl, and queue up the children if it fails auto process_snarl = [&](const Snarl* snarl) { bool was_called = call_snarl(*snarl); if (!was_called && recurse_on_fail) { const vector<const Snarl*>& children = snarl_manager.children_of(snarl); -#pragma omp critical (snarl_queue) - { - snarl_queue.insert(snarl_queue.end(), children.begin(), children.end()); - } + vector<const Snarl*>& thread_queue = snarl_queue[omp_get_thread_num()]; + thread_queue.insert(thread_queue.end(), children.begin(), children.end()); } }; @@ -33,9 +32,14 @@ void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { snarl_manager.for_each_top_level_snarl_parallel(process_snarl); // Then recurse on any children the snarl caller failed to handle - while (!snarl_queue.empty()) { + while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), + [](const vector<const Snarl*>& snarl_vec) {return snarl_vec.empty();})) { vector<const Snarl*> cur_queue; - std::swap(snarl_queue, cur_queue); + for (vector<const Snarl*>& thread_queue : snarl_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } #pragma omp parallel for for (int i = 0; i < cur_queue.size(); ++i) { process_snarl(cur_queue[i]); From 98ae8bdf0584f2b73ef45beaa1b821994a21ac43 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 Nov 2019 13:31:14 -0800 Subject: [PATCH 52/79] Clear out old cmake stuff to better handle GNU->clang compiler change --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a99320eb884..4ae906710d1 100644 --- a/Makefile +++ b/Makefile @@ -499,9 +499,11 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc # We moved the Dynamic headers so make sure to clean up the old ones.
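# Note: the recipes below now start from a wiped build directory, so cmake cannot reuse settings detected for a previous compiler (the GNU->clang change this patch targets).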

From 98ae8bdf0584f2b73ef45beaa1b821994a21ac43 Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Thu, 14 Nov 2019 13:31:14 -0800
Subject: [PATCH 52/79] Clear out old cmake stuff to better handle GNU->clang
 compiler change

---
 Makefile | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index a99320eb884..4ae906710d1 100644
--- a/Makefile
+++ b/Makefile
@@ -499,9 +499,11 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc
 
 # We moved the Dynamic headers so make sure to clean up the old ones.
 $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/*.hpp $(DYNAMIC_DIR)/include/internal/*.hpp
 	rm -Rf $(INC_DIR)/dynamic.hpp $(INC_DIR)/dynamic
-	mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/* $(INC_DIR)/dynamic
 	# annoyingly doesn't have an install option on the cmake, so we manually move their external dependency headers
-	cd $(CWD)/$(DYNAMIC_DIR) && mkdir -p build && cd build && cmake .. && make && cp -r hopscotch_map-prefix/src/hopscotch_map/include/* $(CWD)/$(INC_DIR)/dynamic
+	cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && cmake .. && make && cp -r hopscotch_map-prefix/src/hopscotch_map/include/* $(CWD)/$(INC_DIR)/
+	# Do the copy of the main file last so we can tell if this recipe failed and redo it.
+	# Otherwise we get dynamic.hpp without its deps
+	mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/* $(INC_DIR)/dynamic
 
 $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h)
 	+. ./source_me.sh && cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="-L/opt/local/lib" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install
@@ -617,7 +619,7 @@ $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB
 	. ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/
 
 $(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/src/*.cpp $(LIBBDSG_DIR)/include/bdsg/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp
-	+. ./source_me.sh && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && pwd && cp -r include/bdsg $(CWD)/$(INC_DIR)
+	+. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && pwd && cp -r include/bdsg $(CWD)/$(INC_DIR)
 
 $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp
 $(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp

From 5ad96cdc3d4a9986087c8a3c1cf71f8d7d4fbd38 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Thu, 14 Nov 2019 16:41:20 -0500
Subject: [PATCH 53/79] more constraints for traversal enumeration

---
 src/snarl_caller.cpp      | 18 +++++++++++-------
 src/snarl_caller.hpp      |  8 +++++++-
 src/traversal_support.cpp |  3 ++-
 src/traversal_support.hpp |  6 ++++--
 4 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index c7567a90439..3a1754ceb4b 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -305,7 +305,7 @@ void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
         shared_travs.push_back(genotype[0]);
     }
     // compute the support of our called alleles
-    vector<Support> allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0);
+    vector<Support> allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0);
 
     // Compute the total support for all the alts that will be appearing
     Support total_support = std::accumulate(allele_supports.begin(), allele_supports.end(), Support());
@@ -481,6 +481,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
     // sort the traversals by support
     vector<int> ranked_traversals = rank_by_support(supports);
     size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size());
+    size_T max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size());
+    // take the top-m traversals in order to check against the top traversal
+    set<int> top_traversals(ranked_traversals.begin(), ranked_traversals.begin() + max_sec_trav);
 
     // the candidate genotypes and their supports.
     // the numbers here are alleles as indexed in traversals[]
     set<vector<int>> candidates;
 
@@ -507,7 +510,7 @@
 
     // we prune out traversals whose exclusive support (structure that is not shared with best traversal)
     // doesn't meet a certain cutoff
-    vector<Support> secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, true, false, false, ref_trav_idx);
+    vector<Support> secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, top_traversals, true, false, false, ref_trav_idx);
     for (int j = 0; j < secondary_exclusive_supports.size(); ++j) {
         if (j != best_allele &&
             support_val(secondary_exclusive_supports[j]) < min_total_support_for_call &&
@@ -517,7 +520,7 @@
     }
 
     // get the supports of each traversal in light of best
-    vector<Support> secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, false, false, false, ref_trav_idx);
+    vector<Support> secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, top_traversals, false, false, false, ref_trav_idx);
     vector<int> ranked_secondary_traversals = rank_by_support(secondary_supports);
 
     // add the homozygous genotype for our best allele
@@ -551,7 +554,7 @@
     double best_genotype_likelihood = -numeric_limits<double>::max();
     vector<int> best_genotype;
     for (const auto& candidate : candidates) {
-        double gl = genotype_likelihood(candidate, traversals, ref_trav_idx, exp_depth, depth_err);
+        double gl = genotype_likelihood(candidate, traversals, top_traversals, ref_trav_idx, exp_depth, depth_err);
         if (gl > best_genotype_likelihood) {
             best_genotype_likelihood = gl;
             best_genotype = candidate;
@@ -566,12 +569,13 @@
 
 double PoissonSupportSnarlCaller::genotype_likelihood(const vector<int>& genotype,
                                                       const vector<SnarlTraversal>& traversals,
+                                                      const set<int>& trav_subset,
                                                       int ref_trav_idx, double exp_depth, double depth_err) {
 
     assert(genotype.size() == 1 || genotype.size() == 2);
 
     // get the genotype support
-    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, ref_trav_idx);
+    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, trav_subset, ref_trav_idx);
 
     // get the total support over the site
     Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support());
@@ -645,7 +649,7 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     assert(traversals.size() == variant.alleles.size());
 
     // get the genotype support
-    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, 0);
+    vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0);
 
     // Get the depth of the site
     Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support());
@@ -690,7 +694,7 @@
     // assume ploidy 2
     for (int i = 0; i < traversals.size(); ++i) {
         for (int j = i; j < traversals.size(); ++j) {
-            double gl = genotype_likelihood({i, j}, traversals, 0, exp_depth, depth_err);
+            double gl = genotype_likelihood({i, j}, traversals, {}, 0, exp_depth, depth_err);
             gen_likelihoods.push_back(gl);
             if (vector<int>({i, j}) == genotype || vector<int>({j,i}) == genotype) {
                 gen_likelihood = gl;
diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp
index 1be11053721..1ec2ce9ac24 100644
--- a/src/snarl_caller.hpp
+++ b/src/snarl_caller.hpp
@@ -214,8 +214,11 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     /// P[allele1] * P[allele2] * P[uncalled alleles]
     /// Homozygous alleles are split into two, with half support each
     /// The (natural) logarithm is returned
+    /// If trav_subset is not empty, traversals outside that set (and genotype)
+    /// will be ignored to save time
     double genotype_likelihood(const vector<int>& genotype,
                                const vector<SnarlTraversal>& traversals,
+                               const set<int>& trav_subset,
                                int ref_trav_idx, double exp_depth, double depth_err);
 
     /// Rank supports
@@ -225,7 +228,10 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     double baseline_mapping_error = 0.005;
 
     /// Consider up to the top-k traversals (based on support) for genotyping
-    size_t top_k = 25;
+    size_t top_k = 20;
+    /// Consider up to the top-m secondary traversals (based on support) for each top traversal
+    /// (so at most top_k * top_m considered)
+    size_t top_m = 100;
 
     /// Map path name to depth coverage from the packer
     const algorithms::BinnedDepthIndex& depth_index;

diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp
index b561bdefd56..156733f7d92 100644
--- a/src/traversal_support.cpp
+++ b/src/traversal_support.cpp
@@ -65,13 +65,14 @@ Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& trav
 
 vector<Support> TraversalSupportFinder::get_traversal_genotype_support(const vector<SnarlTraversal>& traversals,
                                                                        const vector<int>& genotype,
+                                                                       const set<int>& other_trav_subset,
                                                                        int ref_trav_idx) {
     set<int> tgt_trav_set(genotype.begin(), genotype.end());
     vector<int> tgt_travs(tgt_trav_set.begin(), tgt_trav_set.end());
     // get the support of just the alleles in the genotype, evenly splitting shared stuff
     vector<Support> allele_support = get_traversal_set_support(traversals, tgt_travs, tgt_trav_set, false, false, true, ref_trav_idx);
     // get the support of everything else, treating stuff in the genotype alleles as 0
-    vector<Support> other_support = get_traversal_set_support(traversals, tgt_travs, {}, false, true, false, ref_trav_idx);
+    vector<Support> other_support = get_traversal_set_support(traversals, tgt_travs, other_trav_subset, false, true, false, ref_trav_idx);
     // combine the above two vectors
     for (int allele : tgt_travs) {
         other_support[allele] = allele_support[allele];

diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp
index c26c7899559..de1704559d5 100644
--- a/src/traversal_support.hpp
+++ b/src/traversal_support.hpp
@@ -50,9 +50,11 @@ class TraversalSupportFinder {
     /// some alleles in a genotype, where everything is split evenly among them
     /// anything not in the genotype gets a support using "exclusive_count"
     /// where nodes taken by the genotype are counted as 0
+    /// stuff not in the genotype is limited to other_trav_subset (or all if empty)
     virtual vector<Support> get_traversal_genotype_support(const vector<SnarlTraversal>& traversals,
-                                                           const vector<int>& genotype,
-                                                           int ref_trav_idx = -1);
+                                                           const vector<int>& genotype,
+                                                           const set<int>& other_trav_subset,
+                                                           int ref_trav_idx = -1);
 
     /// traversals: get support for each traversal in this set
     /// shared_travs: if a node appears N times in shared_travs, then it will count as 1 / (N+1) support
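
Patch 53 above prunes genotyping work by ranking traversals on support and drawing candidate genotypes only from the best few (top_k), checked against a somewhat larger top_m set. Reduced to a standalone toy — the support values and names below are invented for illustration, not vg's API — the ranking-and-truncation step looks like this:

```cpp
// Sketch: rank candidate alleles by a support value, then keep only the
// top-k as primary candidates and the top-m as the comparison set.
#include <algorithm>
#include <numeric>
#include <set>
#include <vector>
#include <iostream>

int main() {
    std::vector<double> support = {3.5, 10.0, 0.5, 7.25, 7.25};  // toy values

    // indices 0..n-1 sorted by decreasing support
    std::vector<int> ranked(support.size());
    std::iota(ranked.begin(), ranked.end(), 0);
    std::stable_sort(ranked.begin(), ranked.end(),
                     [&](int a, int b) { return support[a] > support[b]; });

    std::size_t top_k = 2, top_m = 4;
    std::size_t max_trav = std::min(top_k, ranked.size());
    std::size_t max_sec_trav = std::min(top_m, ranked.size());

    // primary candidates and the larger secondary set, as in the patch
    std::vector<int> primary(ranked.begin(), ranked.begin() + max_trav);
    std::set<int> secondary(ranked.begin(), ranked.begin() + max_sec_trav);

    for (int allele : primary) {
        std::cout << "candidate allele " << allele
                  << " support " << support[allele] << "\n";
    }
    std::cout << secondary.size() << " alleles kept for comparison\n";
    return 0;
}
```
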

From 04c7536c2d10caeeb191b9a433b0a38c9e63de7a Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Thu, 14 Nov 2019 16:56:38 -0500
Subject: [PATCH 54/79] typo

---
 src/snarl_caller.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 3a1754ceb4b..46a589cec09 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -481,7 +481,7 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
     // sort the traversals by support
     vector<int> ranked_traversals = rank_by_support(supports);
     size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size());
-    size_T max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size());
+    size_t max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size());
     // take the top-m traversals in order to check against the top traversal
     set<int> top_traversals(ranked_traversals.begin(), ranked_traversals.begin() + max_sec_trav);

From c3752cdcb0ab83005edaeb217695d7c6690e00f6 Mon Sep 17 00:00:00 2001
From: Jerven bolleman
Date: Thu, 7 Nov 2019 20:19:11 +0100
Subject: [PATCH 55/79] The VG ontology was missing position, which we have
 been using for a while already. Also, a domain was wrong.

---
 ontology/vg.html | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 ontology/vg.ttl  | 10 +++++++++-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/ontology/vg.html b/ontology/vg.html
index cd2449e6754..23e4a11ac1d 100644
--- a/ontology/vg.html
+++ b/ontology/vg.html
@@ -144,13 +144,13 @@
[HTML table diff, rendered text only: in the summary table, the count of
Properties changes from 7 to 8 and the count of Object properties changes
from 6 to 7.]
@@ -829,9 +831,51 @@

[HTML table diff, rendered text only: under "Object properties", the existing
vg:node entry has its rdfs:range cell corrected from vg:Step to vg:Node, and a
new vg:position entry (rdf:type owl:ObjectProperty) is added with rdfs:comment
"This is the position on the reference path at which this step starts.",
rdfs:domain vg:Step, rdfs:label "position", and rdfs:range xsd:positiveInteger.]
diff --git a/ontology/vg.ttl b/ontology/vg.ttl
index ffca973c4dd..95bc7e6e975 100644
--- a/ontology/vg.ttl
+++ b/ontology/vg.ttl
@@ -71,8 +71,16 @@
   rdfs:comment "This means that this step occurs on the forward strand of the sequence attached to the node (i.e. it is on the explicit encoded forward (5' to 3') strand) of the predicate node."^^xsd:string ;
   rdfs:domain :Step ;
   rdfs:label "node"^^xsd:string ;
-  rdfs:range :Step ;
+  rdfs:range :Node ;
   .
+
+:position
+  rdf:type owl:ObjectProperty ;
+  rdfs:comment "This is the position on the reference path at which this step starts."^^xsd:string ;
+  rdfs:domain :Step ;
+  rdfs:label "position"^^xsd:string ;
+  rdfs:range xsd:positiveInteger .
+
 :rank
   rdf:type owl:DatatypeProperty ;
   rdfs:comment "The rank records the step place along its path."^^xsd:string ;
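
Patch 56 below changes the error rate that feeds the Poisson read-depth model: the expected number of stray (misplaced) reads at a site is modeled as Poisson with mean error_rate * exp_depth. For reference, a self-contained sketch of that log-likelihood — an illustration of the standard formula, not vg's implementation:

```cpp
// Sketch: Poisson log-likelihood of seeing k "other" reads when the
// error model predicts lambda = error_rate * expected_depth on average.
#include <cmath>
#include <iostream>

// log P(k | lambda) for a Poisson distribution, computed in log space
// so large k doesn't overflow the factorial.
double poisson_log_pmf(double lambda, int k) {
    return k * std::log(lambda) - lambda - std::lgamma(k + 1.0);
}

int main() {
    double exp_depth = 30.0;    // expected coverage at this site
    double error_rate = 0.005;  // flat mapping-error rate, as in the patch
    double lambda = error_rate * exp_depth;

    for (int k : {0, 1, 5}) {
        std::cout << "log P(" << k << " stray reads) = "
                  << poisson_log_pmf(lambda, k) << "\n";
    }
    return 0;
}
```
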

From c5a5fa4b1501f3dcab28a31baa8f1ee9feb525b9 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 18 Nov 2019 16:08:23 -0500
Subject: [PATCH 56/79] pad depth coverage check and just use baseline error

---
 src/snarl_caller.cpp | 19 ++++++++++++++++---
 src/snarl_caller.hpp |  3 +++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 46a589cec09..41fb0d4eb59 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -599,7 +599,12 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector<int>& genotype,
     }
 
     // how many reads would we expect to not map to our genotype due to error
-    double error_rate = std::min(0.05, depth_err + baseline_mapping_error);
+    // Note: The bin size is set quite a bit smaller than originally intended as it seems to
+    // help nearly every benchmark.  But the small bin size means that depth_err, the
+    // error from the binned coverage, is way too high and including it only causes trouble.
+    // tldr: just use the baseline_mapping_error constant and forget about depth_err for now.
+    //double error_rate = std::min(0.05, depth_err + baseline_mapping_error);
+    double error_rate = baseline_mapping_error;
 
     double other_poisson_lambda = error_rate * exp_depth; //support_val(total_site_support);
 
     // and our likelihood for the unmapped reads we see:
@@ -648,6 +653,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
 
     assert(traversals.size() == variant.alleles.size());
 
+    // get the traversal sizes
+    vector<int> traversal_sizes = support_finder.get_traversal_sizes(traversals);
+
     // get the genotype support
     vector<Support> genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0);
 
     // Get the depth of the site
     Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support());
@@ -684,8 +692,13 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
     double gen_likelihood;
 
     variant.format.push_back("GL");
 
-    // expected depth from our coverage
-    pair<size_t, size_t> ref_range = make_pair(variant.position, variant.position + variant.ref.length());
+    // expected depth from our coverage. we look at the reference-range from the snarl plus a bit of padding,
+    // averaging over every depth bin this touches.  todo: adaptively compute nearby coverage without bins
+    // (requires VCFGenotyper to be refactored to require a PathPositionIndex)
+    size_t longest_traversal = *max_element(traversal_sizes.begin(), traversal_sizes.end());
+    size_t padding = (depth_padding_factor * longest_traversal) / 2;
+    pair<size_t, size_t> ref_range = make_pair(max((long)0, (long)(variant.position - padding)),
+                                               variant.position + variant.ref.length() + padding);
     auto depth_info = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first, ref_range.second);
     double exp_depth = depth_info.first;
     double depth_err = depth_info.second;

diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp
index 1ec2ce9ac24..b2319c9197c 100644
--- a/src/snarl_caller.hpp
+++ b/src/snarl_caller.hpp
@@ -232,6 +232,9 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller {
     /// Consider up to the top-m secondary traversals (based on support) for each top traversal
     /// (so at most top_k * top_m considered)
     size_t top_m = 100;
+
+    /// padding to apply wrt the longest traversal to snarl ranges when looking up binned depth
+    double depth_padding_factor = 1.;
 
     /// Map path name to depth coverage from the packer
     const algorithms::BinnedDepthIndex& depth_index;

From 64428e2af14dace4bc118fc4cfb127a99166bae8 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 19 Nov 2019 16:58:02 -0500
Subject: [PATCH 57/79] allow 1bp bin and its nan variance

---
 src/algorithms/coverage_depth.cpp |  4 ++--
 src/snarl_caller.cpp              | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp
index 7429a3b6e0d..3e39894c980 100644
--- a/src/algorithms/coverage_depth.cpp
+++ b/src/algorithms/coverage_depth.cpp
@@ -90,7 +90,7 @@ pair<double, double> packed_depth_of_bin(const Packer& packer,
             }
         }
     }
-    return wellford_mean_var(bin_length, mean, M2, true);
+    return wellford_mean_var(bin_length, mean, M2);
 }
 
 vector<tuple<size_t, size_t, double, double>> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size,
@@ -225,7 +225,7 @@ static pair<double, double> combine_and_average_node_coverages(const HandleGraph
         }
     }
 
-    return wellford_mean_var(count, mean, M2, count < graph.get_node_count());
+    return wellford_mean_var(count, mean, M2);
 }
 
diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index 41fb0d4eb59..26333773822 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -547,8 +547,9 @@ vector<int> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
     // expected depth from our coverage
     auto depth_info = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first, ref_range.second);
     double exp_depth = depth_info.first;
-    double depth_err = depth_info.second;
-    assert(!isnan(exp_depth) && !isnan(depth_err));
+    assert(!isnan(exp_depth));
+    // variance/std-err can be nan when binsize < 2. We just clamp it to 0
+    double depth_err = !isnan(depth_info.second) ? depth_info.second : 0.;
 
     // genotype (log) likelihoods
     double best_genotype_likelihood = -numeric_limits<double>::max();
@@ -701,8 +702,9 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
                                                 variant.position + variant.ref.length() + padding);
     auto depth_info = algorithms::get_depth_from_index(depth_index, variant.sequenceName, ref_range.first, ref_range.second);
     double exp_depth = depth_info.first;
-    double depth_err = depth_info.second;
-    assert(!isnan(exp_depth) && !isnan(depth_err));
+    assert(!isnan(exp_depth));
+    // variance/std-err can be nan when binsize < 2. We just clamp it to 0
+    double depth_err = !isnan(depth_info.second) ? depth_info.second : 0.;
 
     // assume ploidy 2
     for (int i = 0; i < traversals.size(); ++i) {
         for (int j = i; j < traversals.size(); ++j) {

From dd9113afb32dc8658fb744227ed6957b3f400f96 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Wed, 20 Nov 2019 10:44:07 -0500
Subject: [PATCH 58/79] Cut softclips by default in vg augment. Disable with
 -S instead of enabling with -C

---
 README.md                       |  7 ++++---
 src/subcommand/augment_main.cpp | 13 ++++++++-----
 test/t/04_vg_align.t            |  4 ++--
 test/t/17_vg_augment.t          | 14 +++++++-------
 test/t/18_vg_call.t             |  2 +-
 5 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 028a68ae0e7..f2156bab557 100644
--- a/README.md
+++ b/README.md
@@ -224,11 +224,12 @@ Variation from alignments can be embedded back into the graph. This process is
 
 ```sh
 # augment the graph with all variation from the GAM except that implied by soft clips, saving to aug.vg.  aug.gam contains the same reads as aln.gam but mapped to aug.vg
-vg augment x.vg aln.gam -C -A aug.gam > aug.vg
+vg augment x.vg aln.gam -A aug.gam > aug.vg
 
 # augment the graph with all variation from the GAM, saving each mapping as a path in the graph.
+# softclips of alignment paths are preserved (`-S`).
 # Note, this can be much less efficient than the above example if there are many alignments in the GAM
-vg augment x.vg aln.gam -i > aug_with_paths.vg
+vg augment x.vg aln.gam -i -S > aug_with_paths.vg
 ```
 
 ### Variant Calling
@@ -247,7 +248,7 @@ vg pack -x x.xg -g aln.gam -Q 5 -o aln.pack
 vg call x.xg -k aln.pack > graph_calls.vcf
 ```
 
-In order to also consider *novel* variants from the reads, use the augmented graph and gam (as created in the previous example using `vg augment -C -A`):
+In order to also consider *novel* variants from the reads, use the augmented graph and gam (as created in the previous example using `vg augment -A`):
 
 ```sh
 # Index our augmented graph
diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp
index 994a65c4050..c694628e22c 100644
--- a/src/subcommand/augment_main.cpp
+++ b/src/subcommand/augment_main.cpp
@@ -44,7 +44,7 @@ void help_augment(char** argv, ConfigurableParser& parser) {
          << endl
          << "general options:" << endl
          << "    -i, --include-paths       merge the paths implied by alignments into the graph" << endl
-         << "    -C, --cut-softclips       drop softclips from the paths (recommended)" << endl
+         << "    -S, --keep-softclips      include softclips from input alignments (they are cut by default)" << endl
         << "    -B, --label-paths         don't augment with alignments, just use them for labeling the graph" << endl
         << "    -Z, --translation FILE    save translations from augmented back to base graph to FILE" << endl
         << "    -A, --alignment-out FILE  save augmented GAM reads to FILE" << endl
@@ -74,7 +74,7 @@ int main_augment(int argc, char** argv) {
     bool include_paths = false;
 
     // Include the softclips for each path
-    bool include_softclips = true;
+    bool include_softclips = false;
 
     // Just label the paths with the GAM
     bool label_paths = false;
@@ -121,7 +121,7 @@ int main_augment(int argc, char** argv) {
         {"translation", required_argument, 0, 'Z'},
         {"alignment-out", required_argument, 0, 'A'},
         {"include-paths", no_argument, 0, 'i'},
-        {"cut-softclips", no_argument, 0, 'C'},
+        {"keep-softclips", no_argument, 0, 'S'},
         {"label-paths", no_argument, 0, 'B'},
         {"subgraph", no_argument, 0, 's'},
         {"min-coverage", required_argument, 0, 'm'},
         {"include-gt", required_argument, 0, 'L'},
         {0, 0, 0, 0}
     };
-    static const char* short_options = "a:Z:A:iCBhpvt:l:L:sm:c:q:Q:";
+
static const char* short_options = "a:Z:A:iCSBhpvt:l:L:sm:c:q:Q:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -160,7 +160,10 @@ int main_augment(int argc, char** argv) { include_paths = true; break; case 'C': - include_softclips = false; + cerr << "[vg augment] warning: -C / --cut-softclips option is deprecated (now enabled by default)" << endl; + break; + case 'S': + include_softclips = true; break; case 'B': label_paths = true; diff --git a/test/t/04_vg_align.t b/test/t/04_vg_align.t index cb78999eca4..d78525815fd 100644 --- a/test/t/04_vg_align.t +++ b/test/t/04_vg_align.t @@ -37,8 +37,8 @@ is $(vg align -js GGCTATGTCTGAACTAGGAGGGTAGAAAGAATATTCATTTTGGTTGCCACAAACCATCGAAA vg construct -m 1000 -r tiny/tiny.fa >t.vg seq=CAAATAAGGCTTGGAAATGTTCTGGAGTTCTATTATATTCCAACTCTCTT -vg align -s $seq t.vg | vg augment t.vg - -i >t2.vg -is $(vg align -s $seq -Q query t2.vg | vg augment t2.vg - -i -B | vg view - | grep "query" | cut -f 3 | grep -o "[0-9]\+" | wc -l) 4 "align can use query names and outputs GAM" +vg align -s $seq t.vg | vg augment t.vg - -i -S >t2.vg +is $(vg align -s $seq -Q query t2.vg | vg augment t2.vg - -i -B -S | vg view - | grep "query" | cut -f 3 | grep -o "[0-9]\+" | wc -l) 4 "align can use query names and outputs GAM" rm t.vg t2.vg diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index ed66c2dc910..99a52fe7977 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -61,8 +61,8 @@ rm -rf t.idx.xg t.idx.gcsa read_aug.gam vg construct -v tiny/tiny.vcf.gz -r tiny/tiny.fa >t.vg vg align -s GGGGGGGAAATTTTCTGGAGTTCTATTATATTCCAAAAAAAAAA t.vg >t.gam -is $(vg augment -i t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i t.vg t.gam | vg stats -H - | awk '{ print $3}') | cut -f 3) GGGGG "a soft clip at read start becomes a new head of the graph" -is $(vg augment -i t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i t.vg t.gam | vg stats -T - | awk '{ print $3}') | cut -f 3) AAAAAAAA "a soft clip at read end becomes a new tail of the graph" +is $(vg augment -i -S t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i -S t.vg t.gam | vg stats -H - | awk '{ print $3}') | cut -f 3) GGGGG "a soft clip at read start becomes a new head of the graph" +is $(vg augment -i -S t.vg t.gam | vg view - | grep ^S | grep $(vg augment -i -S t.vg t.gam | vg stats -T - | awk '{ print $3}') | cut -f 3) AAAAAAAA "a soft clip at read end becomes a new tail of the graph" vg align -s AAATTTTCTGGAGTTCTAT t.vg >> t.gam vg find -x t.vg -n 9 -c 1 > n9.vg vg augment n9.vg t.gam -s -A n9_aug.gam > /dev/null @@ -72,7 +72,7 @@ rm -rf t.vg t.gam n9.vg n9_aug.gam vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg -g x.gcsa -k 16 x.vg vg map -x x.xg -g x.gcsa -G small/x-s1337-n100-e0.01-i0.005.gam -t 1 >x.gam -vg augment -Z x.trans -i x.vg x.gam >x.mod.vg +vg augment -Z x.trans -i -S x.vg x.gam >x.mod.vg is $(vg view -Z x.trans | wc -l) 1288 "the expected graph translation is exported when the graph is edited" rm -rf x.vg x.xg x.gcsa x.reads x.gam x.mod.vg x.trans @@ -82,17 +82,17 @@ vg index -x 2snp.xg 2snp.vg vg sim -l 30 -x 2snp.xg -n 30 -a >2snp.sim vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam -is $(vg augment flat.vg 2snp.gam -i | vg mod -D - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" +is $(vg augment flat.vg 2snp.gam -i -S | vg mod -D - | vg 
mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg vg sim -l 30 -x 2err.vg -n 10 -a >2err.sim vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam cat 2snp.gam 2err.gam > 4edits.gam -vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes -vg augment flat.vg 2snp.gam -m 1 | vg view - | grep S | awk '{print $3}' | sort > 2snp_m1.nodes +vg augment flat.vg 2snp.gam -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes +vg augment flat.vg 2snp.gam -m 1 -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_m1.nodes diff 2snp_default.nodes 2snp_m1.nodes is "$?" 0 "augmenting 2 snps with -m 1 produces the same nodes as default" -vg augment flat.vg 4edits.gam -m 11 | vg view - | grep S | awk '{print $3}' | sort > 4edits_m11.nodes +vg augment flat.vg 4edits.gam -m 11 -S | vg view - | grep S | awk '{print $3}' | sort > 4edits_m11.nodes diff 2snp_default.nodes 4edits_m11.nodes is "$?" 0 "augmenting 2 snps and 2 errors with -m 11 produces the same nodes as with just the snps" diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index a15baac7044..b58a699f738 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -104,6 +104,6 @@ vg augment c.vg m.gam -A m.aug.gam >c.aug.vg vg index -x c.aug.xg c.aug.vg vg pack -x c.aug.xg -g m.aug.gam -o m.aug.pack vg call c.aug.xg -k m.aug.pack >m.vcf -is $(cat m.vcf | grep -v "^#" | wc -l) 3 "vg call finds true homozygous variants in a cyclic graph" +is $(cat m.vcf | grep -v "^#" | wc -l) 4 "vg call finds true homozygous variants in a cyclic graph" rm -f c.vg c.xg c.gcsa c.gcsa.lcp m.fa m.vg m.xg m.sim m.gam m.aug.gam c.aug.vg c.aug.xg m.aug.pack m.vcf From 01e0aac083a909d9e5bd2a2350b8ebc788be1e47 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 20 Nov 2019 13:25:01 -0500 Subject: [PATCH 59/79] deprecate instead of remove -C --- src/subcommand/augment_main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index c694628e22c..a9cc8effe58 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -121,6 +121,7 @@ int main_augment(int argc, char** argv) { {"translation", required_argument, 0, 'Z'}, {"alignment-out", required_argument, 0, 'A'}, {"include-paths", no_argument, 0, 'i'}, + {"cut-softclips", no_argument, 0, 'C'}, {"keep-softclips", no_argument, 0, 'S'}, {"label-paths", no_argument, 0, 'B'}, {"subgraph", no_argument, 0, 's'}, From fe1b439fc7a77f270cfe065afeb5669136f9ad5f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Nov 2019 12:00:09 -0500 Subject: [PATCH 60/79] handlify vg_set --- deps/libbdsg | 2 +- deps/libhandlegraph | 2 +- deps/xg | 2 +- src/io/save_handle_graph.hpp | 64 ++++++++++++++++++++++++ src/subcommand/augment_main.cpp | 17 ++----- src/subcommand/convert_main.cpp | 15 ++---- src/subcommand/ids_main.cpp | 37 ++++++++++---- src/vg_set.cpp | 88 +++++++++++++++------------------ src/vg_set.hpp | 9 ++-- 9 files changed, 146 insertions(+), 90 deletions(-) create mode 100644 src/io/save_handle_graph.hpp diff --git a/deps/libbdsg b/deps/libbdsg index d69763eca9f..6c57975dc96 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit d69763eca9fe796bdeb5abd050a585934a8b6407 +Subproject commit 
6c57975dc969403b6cd8ae0017315b176812a793
diff --git a/deps/libhandlegraph b/deps/libhandlegraph
index 541b97315fd..729d2c86805 160000
--- a/deps/libhandlegraph
+++ b/deps/libhandlegraph
@@ -1 +1 @@
-Subproject commit 541b97315fd413846f5a76476907f8d2b2276242
+Subproject commit 729d2c868053d2e2cbe89f9ecf46ee641235ed52
diff --git a/deps/xg b/deps/xg
index fb89754ecde..e3ee79f0550 160000
--- a/deps/xg
+++ b/deps/xg
@@ -1 +1 @@
-Subproject commit fb89754ecde62ddfd4758e4e37004839daeade78
+Subproject commit e3ee79f055083a298f7d04a5fb0d56dd34967b7c
diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp
new file mode 100644
index 00000000000..9085ac5d3c1
--- /dev/null
+++ b/src/io/save_handle_graph.hpp
@@ -0,0 +1,64 @@
+#ifndef VG_IO_SAVE_HANDLE_GRAPH_IO_HPP_INCLUDED
+#define VG_IO_SAVE_HANDLE_GRAPH_IO_HPP_INCLUDED
+
+/**
+ * \file save_handle_graph.hpp
+ * Use vpkg to serialize a HandleGraph object
+ */
+
+#include "bdsg/packed_graph.hpp"
+#include "bdsg/hash_graph.hpp"
+#include "bdsg/odgi.hpp"
+#include "vg.hpp"
+#include "xg.hpp"
+#include <vg/io/vpkg.hpp>
+#include <iostream>
+
+namespace vg {
+
+namespace io {
+
+using namespace std;
+
+
+/**
+ * Save a handle graph using the VPKG::save() function.
+ * Todo: should this be somewhere else (ie in vgio with new types registered?)
+ */
+inline void save_handle_graph(HandleGraph* graph, ostream& os) {
+    if (dynamic_cast<VG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<VG*>(graph), os);
+    } else if (dynamic_cast<bdsg::HashGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph), os);
+    } else if (dynamic_cast<bdsg::PackedGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph), os);
+    } else if (dynamic_cast<bdsg::ODGI*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph), os);
+    } else if (dynamic_cast<xg::XG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<xg::XG*>(graph), os);
+    } else {
+        throw runtime_error("Internal error: unable to serialize graph");
+    }
+}
+
+inline void save_handle_graph(HandleGraph* graph, const string& dest_path) {
+    if (dynamic_cast<VG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<VG*>(graph), dest_path);
+    } else if (dynamic_cast<bdsg::HashGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph), dest_path);
+    } else if (dynamic_cast<bdsg::PackedGraph*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph), dest_path);
+    } else if (dynamic_cast<bdsg::ODGI*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph), dest_path);
+    } else if (dynamic_cast<xg::XG*>(graph) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<xg::XG*>(graph), dest_path);
+    } else {
+        throw runtime_error("Internal error: unable to serialize graph");
+    }
+}
+
+}
+
+}
+
+#endif
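
The new header probes the concrete graph type at runtime with a chain of dynamic_casts. The same dispatch pattern, reduced to a compilable toy where Graph, Vg, and HashGraph are stand-ins rather than the real classes:

```cpp
// Sketch: serialize through a base pointer by probing concrete types,
// mirroring the dynamic_cast chain in save_handle_graph().
#include <iostream>
#include <memory>
#include <stdexcept>

struct Graph { virtual ~Graph() = default; };
struct Vg : Graph {};         // stand-in for vg::VG
struct HashGraph : Graph {};  // stand-in for bdsg::HashGraph

void save(Graph* g, std::ostream& os) {
    if (dynamic_cast<Vg*>(g) != nullptr) {
        os << "saved as vg\n";
    } else if (dynamic_cast<HashGraph*>(g) != nullptr) {
        os << "saved as hash graph\n";
    } else {
        // unknown implementation: fail loudly, as the patch does
        throw std::runtime_error("unable to serialize graph");
    }
}

int main() {
    std::unique_ptr<Graph> g = std::make_unique<HashGraph>();
    save(g.get(), std::cout);
    return 0;
}
```
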
diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp
index a9cc8effe58..5bf9918b90c 100644
--- a/src/subcommand/augment_main.cpp
+++ b/src/subcommand/augment_main.cpp
@@ -26,6 +26,7 @@
 #include "../vg.hpp"
 #include "../augment.hpp"
 #include "../packer.hpp"
+#include "../io/save_handle_graph.hpp"
 #include
 #include
 #include
@@ -34,6 +35,7 @@
 #include "bdsg/odgi.hpp"
 #include
 
+
 using namespace std;
 using namespace vg;
 using namespace vg::subcommand;
@@ -388,19 +390,8 @@ int main_augment(int argc, char** argv) {
         }
     }
 
-    // Serialize the graph using VPKG. Todo: is there away to do this in one line?
-    // could just call serialie() directly if willing to forego vpkg...
-    if (vg_graph != nullptr) {
-        vg::io::VPKG::save(*vg_graph, cout);
-    } else if (dynamic_cast<bdsg::PackedGraph*>(graph.get()) != nullptr) {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph.get()), cout);
-    } else if (dynamic_cast<bdsg::HashGraph*>(graph.get()) != nullptr) {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph.get()), cout);
-    } else if (dynamic_cast<bdsg::ODGI*>(graph.get()) != nullptr) {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph.get()), cout);
-    } else {
-        throw runtime_error("Internal error: vg augment cannot output this graph format");
-    }
+    // Serialize the graph using VPKG.
+    vg::io::save_handle_graph(graph.get(), cout);
 
     return 0;
 }
diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp
index cd3fa4f476d..688b41a9a2f 100644
--- a/src/subcommand/convert_main.cpp
+++ b/src/subcommand/convert_main.cpp
@@ -3,6 +3,7 @@
 #include "../utility.hpp"
 #include "xg.hpp"
 #include "../convert_handle.hpp"
+#include "../io/save_handle_graph.hpp"
 
 #include
 #include
@@ -130,18 +131,8 @@ int main_convert(int argc, char** argv) {
         convert_handle_graph(input_graph.get(), mutable_output_graph);
     }
 
-    // Serialize the graph using VPKG. Todo: is there away to do this in one line?
-    if (output_format == "vg") {
-        vg::io::VPKG::save(*dynamic_cast<VG*>(output_graph.get()), cout);
-    } else if (output_format == "hash") {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(output_graph.get()), cout);
-    } else if (output_format == "packed") {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(output_graph.get()), cout);
-    } else if (output_format == "xg") {
-        vg::io::VPKG::save(*dynamic_cast<xg::XG*>(output_graph.get()), cout);
-    } else if (output_format == "odgi") {
-        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(output_graph.get()), cout);
-    }
+    // Serialize the graph using VPKG.
+    vg::io::save_handle_graph(output_graph.get(), cout);
 
     return 0;
 }
diff --git a/src/subcommand/ids_main.cpp b/src/subcommand/ids_main.cpp
index 2b7c831f857..fa42a64670e 100644
--- a/src/subcommand/ids_main.cpp
+++ b/src/subcommand/ids_main.cpp
@@ -15,7 +15,14 @@
 #include "../vg.hpp"
 #include "../vg_set.hpp"
 #include "../algorithms/topological_sort.hpp"
-
+#include
+#include
+#include
+#include "bdsg/packed_graph.hpp"
+#include "bdsg/hash_graph.hpp"
+#include "bdsg/odgi.hpp"
+#include
+#include "../io/save_handle_graph.hpp"
 #include
 
 using namespace std;
@@ -110,19 +117,30 @@ int main_ids(int argc, char** argv) {
     }
 
     if (!join && mapping_name.empty()) {
-        VG* graph;
+        unique_ptr<MutablePathMutableHandleGraph> graph;
         get_input_file(optind, argc, argv, [&](istream& in) {
-            graph = new VG(in);
-        });
+            graph = vg::io::VPKG::load_one<MutablePathMutableHandleGraph>(in);
+        });
 
         if (sort) {
             // Set up the nodes so we go through them in topological order
-            graph->sort();
+            graph->apply_ordering(algorithms::topological_order(graph.get()), true);
         }
 
-        if (compact || sort) {
+        if (compact && !sort) {
             // Compact only, or compact to re-assign IDs after sort
-            graph->compact_ids();
+            VG* vg_graph = dynamic_cast<VG*>(graph.get());
+            if (vg_graph != nullptr) {
+                vg_graph->compact_ids();
+            } else {
+                // try to use the compact-option from apply_ordering
+                vector<handle_t> graph_ordering(graph->get_node_count());
+                size_t i = 0;
+                graph->for_each_handle([&](handle_t handle) {
+                    graph_ordering[i++] = handle;
+                });
+                graph->apply_ordering(graph_ordering, true);
+            }
         }
 
         if (increment != 0) {
@@ -130,11 +148,10 @@ int main_ids(int argc, char** argv) {
         }
 
         if (decrement != 0) {
-            graph->decrement_node_ids(decrement);
+            graph->increment_node_ids(-decrement);
         }
 
-        graph->serialize_to_ostream(std::cout);
-        delete graph;
+        vg::io::save_handle_graph(graph.get(), cout);
 
     } else {
 
         vector<string> graph_file_names;
diff --git a/src/vg_set.cpp b/src/vg_set.cpp
index f3e467a952a..7a60b06bd25 100644 --- a/src/vg_set.cpp +++ b/src/vg_set.cpp @@ -1,72 +1,62 @@ #include "vg_set.hpp" #include #include "source_sink_overlay.hpp" +#include +#include +#include "io/save_handle_graph.hpp" namespace vg { -// sets of VGs on disk +// sets of MutablePathMutableHandleGraphs on disk -void VGset::transform(std::function lambda) { +void VGset::transform(std::function lambda) { for (auto& name : filenames) { // load - VG* g = NULL; - if (name == "-") { - g = new VG(std::cin, show_progress & progress_bars); - } else { - ifstream in(name.c_str()); - if (!in) throw ifstream::failure("failed to open " + name); - g = new VG(in, show_progress & progress_bars); - in.close(); + unique_ptr g; + get_input_file(name, [&](istream& in) { + // Note: I would have liked to just load a MutableHandleGraph here but the resulting pointer + // is broken (tested: VG and PackedGraph) + g = vg::io::VPKG::load_one(in); + }); + // legacy: + VG* vg_g = dynamic_cast(g.get()); + if (vg_g != nullptr) { + vg_g->name = name; } - g->name = name; // apply - lambda(g); + lambda(g.get()); // write to the same file - ofstream out(name.c_str()); - g->serialize_to_ostream(out); - out.close(); - delete g; + vg::io::save_handle_graph(g.get(), name); } } -void VGset::for_each(std::function lambda) { +void VGset::for_each(std::function lambda) { for (auto& name : filenames) { // load - VG* g = NULL; - if (name == "-") { - g = new VG(std::cin, show_progress & progress_bars); - } else { - ifstream in(name.c_str()); - if (!in) throw ifstream::failure("failed to open " + name); - g = new VG(in, show_progress & progress_bars); - in.close(); - } - g->name = name; + unique_ptr g; + get_input_file(name, [&](istream& in) { + g = vg::io::VPKG::load_one(in); + }); + // legacy: + VG* vg_g = dynamic_cast(g.get()); + if (vg_g != nullptr) { + vg_g->name = name; + } // apply - lambda(g); - delete g; - } -} - -void VGset::for_each_graph_chunk(std::function lamda) { - for (auto& name : filenames) { - ifstream in(name.c_str()); - vg::io::for_each(in, lamda); + lambda(g.get()); } } id_t VGset::max_node_id(void) { id_t max_id = 0; - for_each_graph_chunk([&](const Graph& graph) { - for (size_t i = 0; i < graph.node_size(); ++i) { - max_id = max(graph.node(i).id(), max_id); - } + for_each([&](HandleGraph* graph) { + max_id = max(graph->max_node_id(), max_id); }); return max_id; } int64_t VGset::merge_id_space(void) { int64_t max_node_id = 0; - auto lambda = [&max_node_id](VG* g) { + auto lambda = [&max_node_id](MutableHandleGraph* g) { if (max_node_id > 0) g->increment_node_ids(max_node_id); max_node_id = g->max_node_id(); }; @@ -183,9 +173,13 @@ void VGset::to_xg(xg::XG& index, const function& paths_to_t } void VGset::for_each_kmer_parallel(size_t kmer_size, const function& lambda) { - for_each([&lambda, kmer_size, this](VG* g) { - g->show_progress = show_progress & progress_bars; - g->preload_progress("processing kmers of " + g->name); + for_each([&lambda, kmer_size, this](HandleGraph* g) { + // legacy + VG* vg_g = dynamic_cast(g); + if (vg_g != nullptr) { + vg_g->show_progress = show_progress & progress_bars; + vg_g->preload_progress("processing kmers of " + vg_g->name); + } //g->for_each_kmer_parallel(kmer_size, path_only, edge_max, lambda, stride, allow_dups, allow_negatives); for_each_kmer(*g, kmer_size, lambda); }); @@ -209,7 +203,7 @@ void VGset::write_gcsa_kmers_ascii(ostream& out, int kmer_size, cout << kp << endl; }; - for_each([&](VG* g) { + for_each([&](HandleGraph* g) { // Make an overlay for each graph, without 
modifying it. Break into tip-less cycle components. // Make sure to use a consistent head and tail ID across all graphs in the set. SourceSinkOverlay overlay(g, kmer_size, head_id, tail_id); @@ -234,7 +228,7 @@ void VGset::write_gcsa_kmers_binary(ostream& out, int kmer_size, size_t& size_li } size_t total_size = 0; - for_each([&](VG* g) { + for_each([&](HandleGraph* g) { // Make an overlay for each graph, without modifying it. Break into tip-less cycle components. // Make sure to use a consistent head and tail ID across all graphs in the set. SourceSinkOverlay overlay(g, kmer_size, head_id, tail_id); @@ -262,7 +256,7 @@ vector VGset::write_gcsa_kmers_binary(int kmer_size, size_t& size_limit, vector tmpnames; size_t total_size = 0; - for_each([&](VG* g) { + for_each([&](HandleGraph* g) { // Make an overlay for each graph, without modifying it. Break into tip-less cycle components. // Make sure to use a consistent head and tail ID across all graphs in the set. SourceSinkOverlay overlay(g, kmer_size, head_id, tail_id); diff --git a/src/vg_set.hpp b/src/vg_set.hpp index 8f8b4cce746..a3d069b9d20 100644 --- a/src/vg_set.hpp +++ b/src/vg_set.hpp @@ -6,7 +6,7 @@ #include #include #include -#include "vg.hpp" +#include "handle.hpp" #include "index.hpp" #include "xg.hpp" #include "kmer.hpp" @@ -14,7 +14,7 @@ namespace vg { -// for dealing with collections of VGs on disk +// for dealing with collections of HandleGraphs on disk class VGset { public: @@ -26,9 +26,8 @@ class VGset { : filenames(files) { }; - void transform(std::function lambda); - void for_each(std::function lambda); - void for_each_graph_chunk(std::function lamda); + void transform(std::function lambda); + void for_each(std::function lambda); /// Stream through the files and determine the max node id id_t max_node_id(void); From 88e355ab1f82a03e5c4105e4169c32922f67a819 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Nov 2019 12:09:07 -0500 Subject: [PATCH 61/79] only increment ids as-needed in vg ids -j --- src/vg_set.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vg_set.cpp b/src/vg_set.cpp index 7a60b06bd25..7ad99c668ff 100644 --- a/src/vg_set.cpp +++ b/src/vg_set.cpp @@ -57,7 +57,10 @@ id_t VGset::max_node_id(void) { int64_t VGset::merge_id_space(void) { int64_t max_node_id = 0; auto lambda = [&max_node_id](MutableHandleGraph* g) { - if (max_node_id > 0) g->increment_node_ids(max_node_id); + int64_t delta = max_node_id - g->min_node_id(); + if (delta >= 0) { + g->increment_node_ids(delta + 1); + } max_node_id = g->max_node_id(); }; transform(lambda); From d75981034d00051736bf16bd9bcb4e3fb21c4b19 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Nov 2019 09:17:27 -0500 Subject: [PATCH 62/79] more vg chunk handlification --- src/chunker.cpp | 116 ++++++++++++++++++---------------- src/chunker.hpp | 4 +- src/io/save_handle_graph.hpp | 21 +++++- src/subcommand/chunk_main.cpp | 49 +++++++++++--- test/t/30_vg_chunk.t | 5 +- 5 files changed, 127 insertions(+), 68 deletions(-) diff --git a/src/chunker.cpp b/src/chunker.cpp index 73066b34b41..6bea18f6087 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -3,6 +3,7 @@ #include #include "chunker.hpp" #include "algorithms/subgraph.hpp" +#include "convert_handle.hpp" //#define debug @@ -19,7 +20,16 @@ PathChunker::~PathChunker() { } void PathChunker::extract_subgraph(const Region& region, int context, int length, bool forward_only, - VG& subgraph, Region& out_region) { + MutablePathMutableHandleGraph& subgraph, Region& out_region) { + // This 
method still depends on VG + // (not a super high priority to port, as calling can now be done at genome scale and we no longer + // have to chunk up paths) + VG* vg_subgraph = dynamic_cast(&subgraph); + if (vg_subgraph == nullptr) { + vg_subgraph = new VG(); + assert(subgraph.get_node_count() == 0); + } + // extract our path range into the graph path_handle_t path_handle = graph->get_path_handle(region.seq); step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); @@ -42,28 +52,28 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length if (graph->get_is_reverse(step_handle)) { step_handle = graph->flip(step_handle); } - if (!subgraph.has_node(graph->get_id(step_handle))) { - subgraph.create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); + if (!vg_subgraph->has_node(graph->get_id(step_handle))) { + vg_subgraph->create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); } }; // expand the context and get path information // if forward_only true, then we only go forward. if (context > 0) { - algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, *vg_subgraph, context, forward_only); } if (length > 0) { - algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, *vg_subgraph, context, forward_only); } else if (context == 0 && length == 0) { - algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); + algorithms::add_connecting_edges_to_subgraph(*graph, *vg_subgraph); } - algorithms::add_subpaths_to_subgraph(*graph, subgraph); + algorithms::add_subpaths_to_subgraph(*graph, *vg_subgraph); // build the vg of the subgraph - subgraph.remove_orphan_edges(); + vg_subgraph->remove_orphan_edges(); // get our range endpoints before context expansion - list& mappings = subgraph.paths.get_path(region.seq); + list& mappings = vg_subgraph->paths.get_path(region.seq); assert(!mappings.empty()); size_t mappings_size = mappings.size(); int64_t input_start_node = graph->get_id(start_handle); @@ -126,13 +136,13 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length for (; prev_it != mappings.begin(); --prev_it) { cur_it = prev_it; --cur_it; - handle_t prev_handle = subgraph.get_handle(prev_it->node_id(), + handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), prev_it->is_reverse()); - handle_t cur_handle = subgraph.get_handle(cur_it->node_id(), + handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), cur_it->is_reverse()); - edge_t edge = subgraph.edge_handle(cur_handle, prev_handle); - if (!path_edge_set.count(make_pair(make_pair(subgraph.get_id(edge.first), subgraph.get_is_reverse(edge.first)), - make_pair(subgraph.get_id(edge.second), subgraph.get_is_reverse(edge.second))))) { + edge_t edge = vg_subgraph->edge_handle(cur_handle, prev_handle); + if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), + make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { #ifdef debug #pragma omp critical(cerr) { @@ -150,13 +160,13 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length cur_it = end_it; prev_it = cur_it; for (++cur_it; cur_it != mappings.end(); ++prev_it, ++cur_it) { - handle_t prev_handle = subgraph.get_handle(prev_it->node_id(), + handle_t prev_handle = 
vg_subgraph->get_handle(prev_it->node_id(), prev_it->is_reverse()); - handle_t cur_handle = subgraph.get_handle(cur_it->node_id(), + handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), cur_it->is_reverse()); - edge_t edge = subgraph.edge_handle(prev_handle, cur_handle); - if (!path_edge_set.count(make_pair(make_pair(subgraph.get_id(edge.first), subgraph.get_is_reverse(edge.first)), - make_pair(subgraph.get_id(edge.second), subgraph.get_is_reverse(edge.second))))) { + edge_t edge = vg_subgraph->edge_handle(prev_handle, cur_handle); + if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), + make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { #ifdef debug #pragma omp critical(cerr) { @@ -192,64 +202,70 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length // Cut our graph so that our reference path end points are graph tips. This will let the // snarl finder use the path to find telomeres. - path_handle_t sg_path_handle = subgraph.get_path_handle(region.seq); - Node* start_node = subgraph.get_node(mappings.begin()->node_id()); - auto sg_start_steps = path_steps_of_handle(subgraph, subgraph.get_handle(start_node->id()), sg_path_handle); + path_handle_t sg_path_handle = vg_subgraph->get_path_handle(region.seq); + Node* start_node = vg_subgraph->get_node(mappings.begin()->node_id()); + auto sg_start_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(start_node->id()), sg_path_handle); if (rewrite_paths && sg_start_steps.size() == 1) { - if (!mappings.begin()->is_reverse() && subgraph.start_degree(start_node) != 0) { - for (auto edge : subgraph.edges_to(start_node)) { + if (!mappings.begin()->is_reverse() && vg_subgraph->start_degree(start_node) != 0) { + for (auto edge : vg_subgraph->edges_to(start_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } - } else if (mappings.begin()->is_reverse() && subgraph.end_degree(start_node) != 0) { - for (auto edge : subgraph.edges_from(start_node)) { + } else if (mappings.begin()->is_reverse() && vg_subgraph->end_degree(start_node) != 0) { + for (auto edge : vg_subgraph->edges_from(start_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } } } - Node* end_node = subgraph.get_node(mappings.rbegin()->node_id()); - auto sg_end_steps = path_steps_of_handle(subgraph, subgraph.get_handle(end_node->id()), sg_path_handle); + Node* end_node = vg_subgraph->get_node(mappings.rbegin()->node_id()); + auto sg_end_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(end_node->id()), sg_path_handle); if (rewrite_paths && sg_end_steps.size() == 1) { - if (!mappings.rbegin()->is_reverse() && subgraph.end_degree(end_node) != 0) { - for (auto edge : subgraph.edges_from(end_node)) { + if (!mappings.rbegin()->is_reverse() && vg_subgraph->end_degree(end_node) != 0) { + for (auto edge : vg_subgraph->edges_from(end_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } - } else if 
(mappings.rbegin()->is_reverse() && subgraph.start_degree(end_node) != 0) { - for (auto edge : subgraph.edges_to(end_node)) { + } else if (mappings.rbegin()->is_reverse() && vg_subgraph->start_degree(end_node) != 0) { + for (auto edge : vg_subgraph->edges_to(end_node)) { #ifdef debug #pragma omp crticial(cerr) { cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; } #endif - subgraph.destroy_edge(edge); + vg_subgraph->destroy_edge(edge); } } } // Sync our updated paths lists back into the Graph protobuf if (rewrite_paths) { - subgraph.paths.rebuild_node_mapping(); - subgraph.paths.rebuild_mapping_aux(); - subgraph.graph.clear_path(); - subgraph.paths.to_graph(subgraph.graph); + vg_subgraph->paths.rebuild_node_mapping(); + vg_subgraph->paths.rebuild_mapping_aux(); + vg_subgraph->graph.clear_path(); + vg_subgraph->paths.to_graph(vg_subgraph->graph); + } + + // copy back out of vg if necessary + if (dynamic_cast(&subgraph) == nullptr) { + convert_path_handle_graph(vg_subgraph, &subgraph); + delete vg_subgraph; } // start could fall inside a node. we find out where in the path the @@ -262,32 +278,22 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length } void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, int length, - bool forward_only, VG& subgraph, + bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region) { - Graph g; - for (vg::id_t i = start; i <= end; ++i) { - Node node; - node.set_id(i); - node.set_sequence(graph->get_sequence(graph->get_handle(i))); - *g.add_node() = node; + subgraph.create_handle(graph->get_sequence(graph->get_handle(i)), i); } - VG vg_g(g); - // expand the context and get path information // if forward_only true, then we only go forward. - algorithms::expand_subgraph_by_steps(*graph, vg_g, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); if (length) { - algorithms::expand_subgraph_by_length(*graph, vg_g, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } - algorithms::add_subpaths_to_subgraph(*graph, vg_g); + algorithms::add_subpaths_to_subgraph(*graph, subgraph); // build the vg - subgraph.extend(vg_g); - subgraph.remove_orphan_edges(); - out_region.start = subgraph.min_node_id(); out_region.end = subgraph.max_node_id(); } diff --git a/src/chunker.hpp b/src/chunker.hpp index 169d083e16b..42829542ee8 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -40,13 +40,13 @@ class PathChunker { * inclusive. * */ void extract_subgraph(const Region& region, int context, int length, bool forward_only, - VG& subgraph, Region& out_region); + MutablePathMutableHandleGraph& subgraph, Region& out_region); /** * Like above, but use (inclusive) id range instead of region on path. 
     */
     void extract_id_range(vg::id_t start, vg::id_t end, int context, int length, bool forward_only,
-                          VG& subgraph, Region& out_region);
+                          MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
     /**
      * Get a set of all edges in the graph along a path region (to check for discontinuities later on)
diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp
index 9085ac5d3c1..ed9c1de3b9d 100644
--- a/src/io/save_handle_graph.hpp
+++ b/src/io/save_handle_graph.hpp
@@ -56,7 +56,26 @@ inline void save_handle_graph(HandleGraph* graph, const string& dest_path) {
         throw runtime_error("Internal error: unable to serialize graph");
     }
 }
-
+
+// Check that output format specifier is a valid graph type
+inline bool valid_output_format(const string& fmt_string) {
+    return fmt_string == "vg" || fmt_string == "pg" || fmt_string == "hg";
+}
+
+// Create a new graph (of handle graph type T) where the implementation is chosen using the format string
+template<class T>
+T* new_output_graph(const string& fmt_string) {
+    if (fmt_string == "vg") {
+        return new VG();
+    } else if (fmt_string == "pg") {
+        return new bdsg::PackedGraph();
+    } else if (fmt_string == "hg") {
+        return new bdsg::HashGraph();
+    } else {
+        return nullptr;
+    }
+}
 
 }
 }
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp
index 225b9a20491..856d5e28bac 100644
--- a/src/subcommand/chunk_main.cpp
+++ b/src/subcommand/chunk_main.cpp
@@ -22,6 +22,8 @@
 #include "../haplotype_extracter.hpp"
 #include "../algorithms/sorted_id_ranges.hpp"
 #include
+#include "../io/save_handle_graph.hpp"
+#include "convert_handle.hpp"
 
 using namespace std;
 using namespace vg;
@@ -68,6 +70,7 @@ void help_chunk(char** argv) {
          << "    -T, --trace              trace haplotype threads in chunks (and only expand forward from input coordinates)." << endl
          << "                             Produces a .annotate.txt file with haplotype frequencies for each chunk." << endl
          << "    -f, --fully-contained    only return GAM alignments that are fully contained within chunk" << endl
+         << "    -O, --output-fmt         Specify output format (vg, pg, hg). [VG]" << endl
          << "    -t, --threads N          for tasks that can be done in parallel, use this many threads [1]" << endl
          << "    -h, --help" << endl;
 }
@@ -100,6 +103,7 @@ int main_chunk(int argc, char** argv) {
     bool fully_contained = false;
     int n_chunks = 0;
     size_t gam_split_size = 0;
+    string output_format = "vg";
 
     int c;
     optind = 2; // force optind past command positional argument
@@ -127,11 +131,12 @@ int main_chunk(int argc, char** argv) {
         {"n-chunks", required_argument, 0, 'n'},
         {"context-length", required_argument, 0, 'l'},
         {"gam-split-size", required_argument, 0, 'm'},
+        {"output-fmt", required_argument, 0, 'O'},
         {0, 0, 0, 0}
     };
 
     int option_index = 0;
-    c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:",
+    c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:O:",
                      long_options, &option_index);
 
@@ -226,6 +231,10 @@ int main_chunk(int argc, char** argv) {
             threads = parse<int>(optarg);
             break;
 
+        case 'O':
+            output_format = optarg;
+            break;
+
         case 'h':
         case '?':
             help_chunk(argv);
@@ -263,6 +272,13 @@ int main_chunk(int argc, char** argv) {
         }
     }
 
+    // check the output format
+    std::transform(output_format.begin(), output_format.end(), output_format.begin(), ::tolower);
+    if (!vg::io::valid_output_format(output_format)) {
+        cerr << "error[vg chunk]: invalid output format" << endl;
+        return 1;
+    }
+
     // figure out which outputs we want.  the graph always
the graph always
     // needs to be chunked, even if only gam output is requested,
     // because we use the graph to get the nodes we're looking for.
@@ -518,15 +534,16 @@ int main_chunk(int argc, char** argv) {
         int tid = omp_get_thread_num();
         Region& region = regions[i];
         PathChunker& chunker = chunkers[tid];
-        VG* subgraph = NULL;
+        MutablePathMutableHandleGraph* subgraph = NULL;
         map<string, int> trace_thread_frequencies;
         if (id_range == false) {
-            subgraph = new VG();
+            subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
             chunker.extract_subgraph(region, context_steps, context_length,
                                      trace, *subgraph, output_regions[i]);
+
         } else {
             if (chunk_graph || context_steps > 0) {
-                subgraph = new VG();
+                subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
                 output_regions[i].seq = region.seq;
                 chunker.extract_id_range(region.start, region.end,
                                          context_steps, context_length, trace,
@@ -556,9 +573,23 @@ int main_chunk(int argc, char** argv) {
             Graph g;
             trace_haplotypes_and_paths(*graph, *gbwt_index.get(), trace_start, trace_steps,
                                        g, trace_thread_frequencies, false);
-            subgraph->paths.for_each([&trace_thread_frequencies](const Path& path) {
-                    trace_thread_frequencies[path.name()] = 1;});
-            subgraph->extend(g);
+            subgraph->for_each_path_handle([&trace_thread_frequencies, &subgraph](path_handle_t path_handle) {
+                    trace_thread_frequencies[subgraph->get_path_name(path_handle)] = 1;});
+            VG* vg_subgraph = dynamic_cast<VG*>(subgraph);
+            if (vg_subgraph != nullptr) {
+                // our graph is in vg format, just extend it
+                vg_subgraph->extend(g);
+            } else {
+                // our graph is not in vg format. convert it, extend it, convert it back
+                // this can eventually be avoided by handlifying the haplotype tracer
+                vg_subgraph = new VG();
+                convert_path_handle_graph(subgraph, vg_subgraph);
+                delete subgraph;
+                vg_subgraph->extend(g);
+                subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
+                convert_path_handle_graph(vg_subgraph, subgraph);
+                delete vg_subgraph;
+            }
         }
 
         ofstream out_file;
@@ -580,8 +611,8 @@ int main_chunk(int argc, char** argv) {
                 }
                 out_stream = &out_file;
             }
-
-            subgraph->serialize_to_ostream(*out_stream);
+
+            vg::io::save_handle_graph(subgraph, *out_stream);
         }
 
         // optional gam chunking
diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t
index 6a59711bc9a..514313b22a6 100644
--- a/test/t/30_vg_chunk.t
+++ b/test/t/30_vg_chunk.t
@@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 16
+plan tests 17
 
 # Construct a graph with alt paths so we can make a gPBWT and later a GBWT
 vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg
@@ -19,6 +19,9 @@ is $(vg chunk -x x.xg -p x -c 10| vg stats - -E) 291 "vg chunk with no options p
 # check a small chunk
 is $(vg chunk -x x.xg -p x:20-30 -c 0 | vg view - -j | jq -c '.path[0].mapping[].position' | jq 'select ((.node_id == "9"))' | grep node | sed s/,// | sort | uniq | wc -l) 1 "chunk has path going through node 9"
 
+# check a small chunk, but using vg input and packed graph output
+is $(vg chunk -x x.vg -p x:20-30 -c 0 -O pg | vg convert -v - | vg view - -j | jq -c '.path[0].mapping[].position' | jq 'select ((.node_id == "9"))' | grep node | sed s/,// | sort | uniq | wc -l) 1 "chunk has path going through node 9"
+
 # check no crash when using chunk_size, and filenames deterministic
 rm -f _chunk_test*
 vg chunk -x x.xg -p x -s 233 -o 50 -b _chunk_test -c 0 -t 2

From 245690d254c82755c1f2d6b7262287e0905c61ff Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Fri, 22 Nov 2019 13:32:33 -0500
Subject: [PATCH 63/79] add components chunking to chunk and deprecate explode
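
(Reader's sketch, not part of the committed change: the new -C/-M modes amount to an
undirected flood fill over node ids, after which each id set is pulled out with
PathChunker::extract_component. Below is a minimal, self-contained illustration of that
flood fill against the libhandlegraph API; the function name components_sketch is
illustrative only, vg's real implementation being algorithms::weakly_connected_components.

    #include <handlegraph/handle_graph.hpp>
    #include <deque>
    #include <unordered_set>
    #include <vector>

    using namespace handlegraph;

    // Collect weakly connected components as sets of node ids by flooding
    // outward from every not-yet-seen node, ignoring edge direction.
    std::vector<std::unordered_set<nid_t>> components_sketch(const HandleGraph& graph) {
        std::vector<std::unordered_set<nid_t>> components;
        std::unordered_set<nid_t> seen;
        graph.for_each_handle([&](const handle_t& start) {
            if (seen.count(graph.get_id(start))) {
                return;
            }
            components.emplace_back();
            std::deque<handle_t> queue = {start};
            while (!queue.empty()) {
                handle_t h = queue.front();
                queue.pop_front();
                if (!seen.insert(graph.get_id(h)).second) {
                    continue; // already flooded through this node
                }
                components.back().insert(graph.get_id(h));
                // Enqueue neighbors off both sides of the node.
                for (bool go_left : {false, true}) {
                    graph.follow_edges(h, go_left, [&](const handle_t& next) {
                        queue.push_back(next);
                    });
                }
            }
        });
        return components;
    }

Seeding extract_component with one such id set and then expanding by
numeric_limits<uint64_t>::max() steps reuses the existing context-expansion machinery to
recover the component's edges and paths.)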
---
 src/algorithms/subgraph.cpp     |   2 +-
 src/chunker.cpp                 |  28 +++++++-
 src/chunker.hpp                 |  18 +++--
 src/subcommand/chunk_main.cpp   | 119 +++++++++++++++++++++++---------
 src/subcommand/explode_main.cpp |   4 ++
 test/t/30_vg_chunk.t            |  30 +++++++-
 6 files changed, 159 insertions(+), 42 deletions(-)

diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp
index cc3e388e7b1..c7a11ae00ca 100644
--- a/src/algorithms/subgraph.cpp
+++ b/src/algorithms/subgraph.cpp
@@ -8,7 +8,7 @@ void expand_subgraph_by_steps(const HandleGraph& source, MutableHandleGraph& sub
     subgraph.for_each_handle([&](const handle_t& h) {
             curr_handles.push_back(h);
         });
-    for (uint64_t i = 0; i < steps; ++i) {
+    for (uint64_t i = 0; i < steps && !curr_handles.empty(); ++i) {
         std::vector<handle_t> next_handles;
         for (auto& h : curr_handles) {
             handle_t old_h = source.get_handle(subgraph.get_id(h));
diff --git a/src/chunker.cpp b/src/chunker.cpp
index 6bea18f6087..70990596f21 100644
--- a/src/chunker.cpp
+++ b/src/chunker.cpp
@@ -19,7 +19,7 @@ PathChunker::~PathChunker() {
 
 }
 
-void PathChunker::extract_subgraph(const Region& region, int context, int length, bool forward_only,
+void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only,
                                    MutablePathMutableHandleGraph& subgraph, Region& out_region) {
     // This method still depends on VG
     // (not a super high priority to port, as calling can now be done at genome scale and we no longer
@@ -277,7 +277,29 @@ void PathChunker::extract_subgraph(const Region& region, int context, int length
     out_region.end = input_end_pos + graph->get_length(end_handle) + right_padding - 1;
 }
 
-void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, int length,
+void PathChunker::extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region) {
+    unordered_set<nid_t> path_ids;
+
+    path_handle_t path_handle = graph->get_path_handle(path_name);
+    for (handle_t handle : graph->scan_path(path_handle)) {
+        path_ids.insert(graph->get_id(handle));
+    }
+
+    extract_component(path_ids, subgraph);
+    out_region.seq = path_name;
+}
+
+void PathChunker::extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph) {
+
+    for (nid_t node_id : node_ids) {
+        subgraph.create_handle(graph->get_sequence(graph->get_handle(node_id)), node_id);
+    }
+
+    algorithms::expand_subgraph_by_steps(*graph, subgraph, numeric_limits<uint64_t>::max());
+    algorithms::add_subpaths_to_subgraph(*graph, subgraph);
+}
+
+void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length,
                                    bool forward_only, MutablePathMutableHandleGraph& subgraph,
                                    Region& out_region) {
 
@@ -299,7 +321,7 @@ void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, in
 }
 
 set<pair<pair<nid_t, bool>, pair<nid_t, bool>>> PathChunker::get_path_edge_index(step_handle_t start_step,
-                                                                                 step_handle_t end_step, int context) const {
+                                                                                 step_handle_t end_step, int64_t context) const {
     // we don't use handles as we're going to use this structure to compare edges across different graphs
     set<pair<pair<nid_t, bool>, pair<nid_t, bool>>> path_edges;
diff --git a/src/chunker.hpp b/src/chunker.hpp
index 42829542ee8..02cc895a726 100644
--- a/src/chunker.hpp
+++ b/src/chunker.hpp
@@ -39,13 +39,30 @@ class PathChunker {
      * NOTE: we follow convention of Region coordinates being 0-based
      * inclusive.
     * */
-    void extract_subgraph(const Region& region, int context, int length, bool forward_only,
+    void extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only,
                           MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
+    /**
+     * Extract a connected component containing a given path
+     */
+    void extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region);
+
+    /**
+     * Extract a connected component starting from an id set
+     */
+    void extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph);
+
     /**
      * Like above, but use (inclusive) id range instead of region on path.
      */
-    void extract_id_range(vg::id_t start, vg::id_t end, int context, int length, bool forward_only,
-                          MutablePathMutableHandleGraph& subgraph, Region& out_region);
+    void extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, bool forward_only,
+                          MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
     /**
      * Get a set of all edges in the graph along a path region (to check for discontinuities later on)
     */
     set<pair<pair<nid_t, bool>, pair<nid_t, bool>>> get_path_edge_index(step_handle_t start_step,
-                                                                        step_handle_t end_step, int context) const;
+                                                                        step_handle_t end_step, int64_t context) const;
 
 };
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp
index 856d5e28bac..eac1188e84f 100644
--- a/src/subcommand/chunk_main.cpp
+++ b/src/subcommand/chunk_main.cpp
@@ -21,6 +21,7 @@
 #include "../region.hpp"
 #include "../haplotype_extracter.hpp"
 #include "../algorithms/sorted_id_ranges.hpp"
+#include "../algorithms/weakly_connected_components.hpp"
 #include
 #include "../io/save_handle_graph.hpp"
 #include "convert_handle.hpp"
 
 using namespace std;
 using namespace vg;
 using namespace vg::subcommand;
 
-static string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi = 0);
+static string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi = 0, bool components = false);
 
 static int split_gam(istream& gam_stream, size_t chunk_size, const string& out_prefix, size_t gam_buffer_size = 100);
 
@@ -43,12 +44,12 @@ void help_chunk(char** argv) {
 << "For a single-range chunk (-p or -r), the graph data is sent to standard output instead of a file." << endl
 << endl
 << "options:" << endl
- << " -x, --xg-name FILE use this xg index to chunk subgraphs" << endl
+ << " -x, --xg-name FILE use this graph or xg index to chunk subgraphs" << endl
 << " -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction" << endl
 << " -a, --gam-name FILE chunk this gam file (not stdin, sorted, with FILE.gai index) instead of the graph (multiple allowed)" << endl
 << " -g, --gam-and-graph when used in combination with -a, both gam and graph will be chunked" << endl
 << "path chunking:" << endl
- << " -p, --path TARGET write the chunk in the specified (0-based inclusive)\n"
+ << " -p, --path TARGET write the chunk in the specified (0-based inclusive, multiple allowed)\n"
 << " path range TARGET=path[:pos1[-pos2]] to standard output" << endl
 << " -P, --path-list FILE write chunks for all path regions in (line - separated file). format" << endl
format" << endl << " for each as in -p (all paths chunked unless otherwise specified)" << endl @@ -59,6 +60,9 @@ void help_chunk(char** argv) { << " -n, --n-chunks N generate this many id-range chunks, which are determined using the xg index" << endl << "simple gam chunking:" << endl << " -m, --gam-split-size N split gam (specified with -a, sort/index not required) up into chunks with at most N reads each" << endl + << "component chunking:" << endl + << " -C, --components create a chunk for each connected component. if a targets given with (-p, -P, -r, -R), limit to components containing them" << endl + << " -M, --path-components create a chunk for each path in the graph's connected component" << endl << "general:" << endl << " -s, --chunk-size N create chunks spanning N bases (or nodes with -r/-R) for all input regions." << endl << " -o, --overlap N overlap between chunks when using -s [0]" << endl @@ -86,7 +90,7 @@ int main_chunk(int argc, char** argv) { string gbwt_file; vector gam_files; bool gam_and_graph = false; - string region_string; + vector region_strings; string path_list_file; int chunk_size = 0; int overlap = 0; @@ -104,6 +108,8 @@ int main_chunk(int argc, char** argv) { int n_chunks = 0; size_t gam_split_size = 0; string output_format = "vg"; + bool components = false; + bool path_components = false; int c; optind = 2; // force optind past command positional argument @@ -131,12 +137,14 @@ int main_chunk(int argc, char** argv) { {"n-chunks", required_argument, 0, 'n'}, {"context-length", required_argument, 0, 'l'}, {"gam-split-size", required_argument, 0, 'm'}, + {"components", no_argument, 0, 'C'}, + {"path-components", no_argument, 0, 'M'}, {"output-fmt", required_argument, 0, 'O'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:O:", + c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:CMO:", long_options, &option_index); @@ -164,7 +172,7 @@ int main_chunk(int argc, char** argv) { break; case 'p': - region_string = optarg; + region_strings.push_back(optarg); break; case 'P': @@ -219,6 +227,15 @@ int main_chunk(int argc, char** argv) { gam_split_size = parse(optarg); break; + case 'C': + components = true; + break; + + case 'M': + components = true; + path_components = true; + break; + case 'T': trace = true; break; @@ -249,9 +266,10 @@ int main_chunk(int argc, char** argv) { omp_set_num_threads(threads); // need at most one of -n, -p, -P, -e, -r, -R, -m as an input - if ((n_chunks == 0 ? 0 : 1) + (region_string.empty() ? 0 : 1) + (path_list_file.empty() ? 0 : 1) + (in_bed_file.empty() ? 0 : 1) + - (node_ranges_file.empty() ? 0 : 1) + (node_range_string.empty() ? 0 : 1) + (gam_split_size == 0 ? 0 : 1) > 1) { - cerr << "error:[vg chunk] at most one of {-n, -p, -P, -e, -r, -R, m} required to specify input regions" << endl; + if ((n_chunks == 0 ? 0 : 1) + (region_strings.empty() ? 0 : 1) + (path_list_file.empty() ? 0 : 1) + (in_bed_file.empty() ? 0 : 1) + + (node_ranges_file.empty() ? 0 : 1) + (node_range_string.empty() ? 0 : 1) + (gam_split_size == 0 ? 0 : 1) + + (path_components ? 
1 : 0) > 1) {
+        cerr << "error:[vg chunk] at most one of {-n, -p, -P, -e, -r, -R, -m, -M} required to specify input regions" << endl;
         return 1;
     }
     // need -a if using -f
     if ((fully_contained || gam_split_size > 0) && gam_files.empty()) {
         cerr << "error:[vg chunk] gam file must be specified with -a when using -f or -m" << endl;
         return 1;
     }
+    if (components == true && context_steps >= 0) {
+        cerr << "error:[vg chunk] context cannot be specified (-c) when splitting into components (-C)" << endl;
+        return 1;
+    }
     // context steps default to 1 if using id_ranges. otherwise, force user to specify to avoid
     // misunderstandings
     if (context_steps < 0 && gam_split_size == 0) {
         if (id_range) {
             if (!context_length) {
                 context_steps = 1;
             }
-        } else {
+        } else if (!components) {
             cerr << "error:[vg chunk] context expansion steps must be specified with -c/--context when chunking on paths" << endl;
             return 1;
         }
@@ -291,15 +313,15 @@ int main_chunk(int argc, char** argv) {
     unique_ptr<PathHandleGraph> path_handle_graph;
     bdsg::PathPositionOverlayHelper overlay_helper;
-    if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0)) {
+    if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0) || components) {
         if (xg_file.empty()) {
-            cerr << "error:[vg chunk] xg index (-x) required" << endl;
+            cerr << "error:[vg chunk] graph or xg index (-x) required" << endl;
             return 1;
         }
 
         ifstream in(xg_file.c_str());
         if (!in) {
-            cerr << "error:[vg chunk] unable to load xg index file " << xg_file << endl;
+            cerr << "error:[vg chunk] unable to load graph / xg index file " << xg_file << endl;
             return 1;
         }
 
@@ -337,10 +359,12 @@ int main_chunk(int argc, char** argv) {
     // parse the regions into a list
     vector<Region> regions;
-    if (!region_string.empty()) {
-        Region region;
-        parse_region(region_string, region);
-        regions.push_back(region);
+    if (!region_strings.empty()) {
+        for (auto& region_string : region_strings) {
+            Region region;
+            parse_region(region_string, region);
+            regions.push_back(region);
+        }
     }
     else if (!path_list_file.empty()) {
         ifstream pr_stream(path_list_file.c_str());
@@ -417,7 +441,7 @@ int main_chunk(int argc, char** argv) {
             delete range_stream;
         }
     }
-    else if (graph != nullptr) {
+    else if (graph != nullptr && (!components || path_components)) {
         // every path
         graph->for_each_path_handle([&](path_handle_t path_handle) {
                 Region region;
@@ -443,7 +467,7 @@ int main_chunk(int argc, char** argv) {
             region.start = max((int64_t)0, region.start);
             if (region.end == -1) {
                 region.end = get_path_length(region.seq) - 1;
-            } else if (!id_range) {
+            } else if (!id_range && !components) {
                 if (region.start < 0 || region.end >= get_path_length(region.seq)) {
                     cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end
                          << " is out of bounds of path " << region.seq << " which has length "<< get_path_length(region.seq)
@@ -472,6 +496,20 @@ int main_chunk(int argc, char** argv) {
         swap(regions, chunked_regions);
     }
 
+    // when using -C for components, regions will be derived from the connected components
+    vector<unordered_set<nid_t>> component_ids;
+    if (components == true && regions.empty()) {
+        // no regions given, we find our components from scratch and make some dummy regions
+        component_ids = algorithms::weakly_connected_components(graph);
+        for (int i = 0; i < component_ids.size(); ++i) {
+            Region region;
+            region.seq = "";
+            region.start = 0;
+            region.end = 0;
+            regions.push_back(region);
+        }
+    }
+
    // now ready to get our chunk on
    if (gam_split_size != 0) {
        for (size_t gi = 0; gi < gam_files.size(); ++gi) {
@@ -536,17 +574,26 @@ int main_chunk(int argc, char** argv) {
        int tid = omp_get_thread_num();
        Region& region = regions[i];
        PathChunker& chunker = chunkers[tid];
        MutablePathMutableHandleGraph* subgraph = NULL;
        map<string, int> trace_thread_frequencies;
-        if (id_range == false) {
+        if (!component_ids.empty()) {
            subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
-            chunker.extract_subgraph(region, context_steps, context_length,
-                                     trace, *subgraph, output_regions[i]);
-
+            chunker.extract_component(component_ids[i], *subgraph);
+            output_regions[i] = region;
+        }
+        else if (id_range == false) {
+            subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
+            if (components == true) {
+                chunker.extract_path_component(region.seq, *subgraph, output_regions[i]);
+            } else {
+                chunker.extract_subgraph(region, context_steps, context_length,
+                                         trace, *subgraph, output_regions[i]);
+            }
        } else {
            if (chunk_graph || context_steps > 0) {
                subgraph = vg::io::new_output_graph<MutablePathMutableHandleGraph>(output_format);
                output_regions[i].seq = region.seq;
                chunker.extract_id_range(region.start, region.end,
-                                         context_steps, context_length, trace,
+                                         components ? numeric_limits<int64_t>::max() : context_steps,
+                                         context_length, trace && !components,
                                         *subgraph, output_regions[i]);
            } else {
                // in this case, there's no need to actually build the subgraph, so we don't
@@ -595,7 +642,7 @@ int main_chunk(int argc, char** argv) {
        ofstream out_file;
        ostream* out_stream = NULL;
        if (chunk_graph) {
-            if ((!region_string.empty() || !node_range_string.empty()) &&
+            if ((!region_strings.empty() || !node_range_string.empty()) &&
                (regions.size() == 1) && chunk_size == 0) {
                // If we are going to output only one chunk, it should go to
                // stdout instead of to a file on disk
            } else {
                // Otherwise, we write files under the specified prefix, using
                // a prefix-i-seq-start-end convention.
-                string name = chunk_name(out_chunk_prefix, i, output_regions[i], ".vg");
+                string name = chunk_name(out_chunk_prefix, i, output_regions[i], "." + output_format, 0, components);
                out_file.open(name);
                if (!out_file) {
                    cerr << "error[vg chunk]: can't open output chunk file " << name << endl;
@@ -622,7 +669,7 @@ int main_chunk(int argc, char** argv) {
            assert(gam_index.get() != nullptr);
            GAMIndex::cursor_t& cursor = cursors_vec[gi][tid];
 
-            string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi);
+            string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi, components);
            ofstream out_gam_file(gam_name);
            if (!out_gam_file) {
                cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl;
@@ -647,7 +694,7 @@ int main_chunk(int argc, char** argv) {
        if (trace) {
            // Even if we have only one chunk, the trace annotation data always
            // ends up in a file.
-            string annot_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".annotate.txt");
+            string annot_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".annotate.txt", 0, components);
            ofstream out_annot_file(annot_name);
            if (!out_annot_file) {
                cerr << "error[vg chunk]: can't open output trace annotation file " << annot_name << endl;
@@ -671,9 +718,9 @@ int main_chunk(int argc, char** argv) {
            const Region& oregion = output_regions[i];
            string seq = id_range ? "ids" : oregion.seq;
            obed << seq << "\t" << oregion.start << "\t" << (oregion.end + 1)
-                 << "\t" << chunk_name(out_chunk_prefix, i, oregion, chunk_gam ? ".gam" : ".vg");
+                 << "\t" << chunk_name(out_chunk_prefix, i, oregion, chunk_gam ? ".gam" : "."
+ output_format, 0, components); if (trace) { - obed << "\t" << chunk_name(out_chunk_prefix, i, oregion, ".annotate.txt"); + obed << "\t" << chunk_name(out_chunk_prefix, i, oregion, ".annotate.txt", 0, components); } obed << "\n"; } @@ -686,15 +733,21 @@ int main_chunk(int argc, char** argv) { static Subcommand vg_chunk("chunk", "split graph or alignment into chunks", main_chunk); // Output name of a chunk -string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi) { +string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi, bool components) { stringstream chunk_name; string seq = region.seq.empty() ? "ids" : region.seq; chunk_name << out_chunk_prefix; if (gi > 0) { chunk_name << "-" << gi; } - chunk_name << "_" << i << "_" << seq << "_" - << region.start << "_" << region.end << ext; + if (!components) { + chunk_name << "_" << i << "_" << seq << "_" << region.start << "_" << region.end; + } else if (region.seq.empty()) { + chunk_name << "_" << i; + } else { + chunk_name << "_" << region.seq; + } + chunk_name << ext; return chunk_name.str(); } diff --git a/src/subcommand/explode_main.cpp b/src/subcommand/explode_main.cpp index b0ef45ee86a..aca755f0e5c 100644 --- a/src/subcommand/explode_main.cpp +++ b/src/subcommand/explode_main.cpp @@ -71,6 +71,10 @@ int main_explode(int argc, char** argv) { } } + cerr << "vg explode is deprecated. Please use \"vg chunk -C source.vg -b part_dir/component\" for same* functionality as \"vg explode source.vg part_dir\"" << endl + << " * (unlike explode, the output directory must already exist when running chunk, though)" << endl; + return 1; + VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 514313b22a6..4a2e6a3b5c4 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 17 +plan tests 20 # Construct a graph with alt paths so we can make a gPBWT and later a GBWT vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -61,3 +61,31 @@ is $(cat x.chunk/*vg | vg view -V - | grep -v P 2>/dev/null | sort | md5sum | c rm -rf x.sorted.gam x.sorted.gam.gai _chunk_test_bed.bed _chunk_test* x.chunk rm -f x.vg x.xg x.gbwt x.gam.json filter_chunk*.gam chunks.bed rm -f chunk_*.annotate.txt + +vg construct -r small/xy.fa -v small/xy.vcf.gz > xy.vg +vg construct -r small/xy.fa -v small/xy.vcf.gz -R x > x.vg +vg construct -r small/xy.fa -v small/xy.vcf.gz -R y > y.vg +# test that exploding into components works +vg chunk -x xy.vg -M -b path_chunk -O hg +vg view x.vg | grep "^S" | awk '{print $3}' | sort > x_nodes.txt +vg view y.vg | grep "^S" | awk '{print $3}' | sort > y_nodes.txt +vg convert path_chunk_x.hg -v | vg view - | grep "^S" | awk '{print $3}' | sort > pc_x_nodes.txt +vg convert path_chunk_y.hg -v | vg view - | grep "^S" | awk '{print $3}' | sort > pc_y_nodes.txt +diff x_nodes.txt pc_x_nodes.txt && diff y_nodes.txt pc_y_nodes.txt +is "$?" 0 "path-based components finds subgraphs" +vg paths -v x.vg -E > x_paths.txt +vg paths -v path_chunk_x.hg -E > pc_x_paths.txt +diff pc_x_paths.txt x_paths.txt +is "$?" 
0 "path-based component contains correct path length" +vg chunk -x xy.vg -C -b components_chunk +vg view components_chunk_0.vg | grep "^S" | awk '{print $3}' > comp_0_nodes.txt +vg view components_chunk_1.vg | grep "^S" | awk '{print $3}' > comp_1_nodes.txt +cat comp_0_nodes.txt comp_1_nodes.txt | sort > comp_nodes.txt +cat x_nodes.txt y_nodes.txt | sort > nodes.txt +diff comp_nodes.txt nodes.txt +is "$?" 0 "components finds subgraphs" + +rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.hg convert path_chunk_y.hg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt + + + From b27cc3c902bd58345247137ad32b0dfcf7ed1878 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Nov 2019 13:37:53 -0500 Subject: [PATCH 64/79] skip alt paths --- src/subcommand/chunk_main.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index eac1188e84f..f6e01118a2d 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -446,7 +446,9 @@ int main_chunk(int argc, char** argv) { graph->for_each_path_handle([&](path_handle_t path_handle) { Region region; region.seq = graph->get_path_name(path_handle); - regions.push_back(region); + if (!Paths::is_alt(region.seq)) { + regions.push_back(region); + } }); } From 5e6c7fb9973ac8b976d351acada89ca81d3e7dfd Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Nov 2019 10:42:29 -0500 Subject: [PATCH 65/79] force dynamic schedule in parallel snarl iteration (for vg call) --- src/snarls.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snarls.cpp b/src/snarls.cpp index add633bb706..9c2e875fd5e 100644 --- a/src/snarls.cpp +++ b/src/snarls.cpp @@ -704,7 +704,7 @@ const vector& SnarlManager::top_level_snarls() const { } void SnarlManager::for_each_top_level_snarl_parallel(const function& lambda) const { -#pragma omp parallel for +#pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < roots.size(); i++) { lambda(roots[i]); } From f2bfe040dda73bd58c351ddc1413304f8e9fe9a2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Nov 2019 10:44:51 -0500 Subject: [PATCH 66/79] more careful about cycle scan --- src/graph_caller.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index f96de89b2fc..48c5047cf5c 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -40,7 +40,8 @@ void GraphCaller::call_top_level_snarls(bool recurse_on_fail) { std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); } -#pragma omp parallel for + +#pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < cur_queue.size(); ++i) { process_snarl(cur_queue[i]); } @@ -364,6 +365,7 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { function get_path_index; VG vg_graph; SupportBasedSnarlCaller& support_caller = dynamic_cast(snarl_caller); + bool was_called = false; if (is_vg) { // our graph is in VG format, so we've sorted this out in the constructor @@ -448,6 +450,8 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { // emit our vcf variant emit_variant(snarl, *rep_trav_finder, called_traversals, genotype, path_name); + + was_called = true; } } if (!is_vg) { @@ -458,7 +462,7 @@ bool LegacyCaller::call_snarl(const Snarl& snarl) { } } - return true; + return was_called; } string 
 string LegacyCaller::vcf_header(const PathHandleGraph& graph, const vector<string>& ref_paths,
@@ -759,15 +763,16 @@ tuple LegacyCaller::get_ref_interval(const Snarl& snarl, c
     assert(start_steps.size() > 0 && end_steps.size() > 0);
 
     step_handle_t start_step = start_steps.begin()->second;
     step_handle_t end_step = end_steps.begin()->second;
-    bool scan_backward = graph.get_is_reverse(graph.get_handle_of_step(start_step));
+    bool scan_backward = graph.get_is_reverse(graph.get_handle_of_step(start_step)) != snarl.start().backward();
 
     // if we're on a cycle, we keep our start step and find the end step by scanning the path
     if (start_steps.size() > 1 || end_steps.size() > 1) {
         bool found_end = false;
+
         if (scan_backward) {
             for (step_handle_t cur_step = start_step; graph.has_previous_step(end_step) && !found_end;
                  cur_step = graph.get_previous_step(cur_step)) {
-                if (graph.get_handle_of_step(cur_step) == end_handle) {
+                if (graph.get_id(graph.get_handle_of_step(cur_step)) == graph.get_id(end_handle)) {
                     end_step = cur_step;
                     found_end = true;
                 }
@@ -776,7 +781,7 @@ tuple LegacyCaller::get_ref_interval(const Snarl& snarl, c
         } else {
             for (step_handle_t cur_step = start_step; graph.has_next_step(end_step) && !found_end;
                  cur_step = graph.get_next_step(cur_step)) {
-                if (graph.get_handle_of_step(cur_step) == end_handle) {
+                if (graph.get_id(graph.get_handle_of_step(cur_step)) == graph.get_id(end_handle)) {
                     end_step = cur_step;
                     found_end = true;
                 }

From 1bfdc321cf6a1f3dc3a5d01223351b82606ccc2a Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Tue, 26 Nov 2019 12:52:17 -0500
Subject: [PATCH 67/79] sort before comparing vcf outputs

---
 test/t/26_deconstruct.t | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t
index 25070642c7f..c1fb4cf9045 100644
--- a/test/t/26_deconstruct.t
+++ b/test/t/26_deconstruct.t
@@ -108,11 +108,11 @@ printf "P\talt2.3\t1+,2+,4+,6+,8+,9+,11+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1M,4M,1
 printf "P\talt2.3\t1+,2+,4+,6+,8+,9+,11+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1M,4M,1M,11M\n" >> tiny_names.gfa
 vg view -Fv tiny_names.gfa > tiny_names.vg
 vg index tiny_names.vg -x tiny_names.xg
-vg deconstruct tiny_names.xg -P ref -A alt1,alt2 -e > tiny_names_decon.vcf
+vg deconstruct tiny_names.xg -P ref -A alt1,alt2 -e | sort > tiny_names_decon.vcf
 is $(grep -v "#" tiny_names_decon.vcf | wc -l) 2 "-P -A options return correct number of variants"
 is $(grep -v "#" tiny_names_decon.vcf | grep ref.1 | wc -l) 2 "-P -A options use correct reference name"
 is $(grep -v "#" tiny_names_decon.vcf | grep ref.1 | grep 14 | grep "CONFLICT=alt1" | wc -l) 1 "-P -A identifies conflict in alt1 in second variant"
-vg deconstruct tiny_names.vg -P ref -A alt1,alt2 -e > tiny_names_decon_vg.vcf
+vg deconstruct tiny_names.vg -P ref -A alt1,alt2 -e | sort > tiny_names_decon_vg.vcf
 diff tiny_names_decon.vcf tiny_names_decon_vg.vcf
 is "$?" 0 "deconstructing vg graph gives same output as xg graph"
0 "deconstructing vg graph gives same output as xg graph" From eb1da3fa345d145d9469a1496aca078b1b8f44b4 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Wed, 27 Nov 2019 13:17:27 -0800 Subject: [PATCH 68/79] add feature option --- src/subcommand/rna_main.cpp | 10 +++++++++- src/transcriptome.cpp | 4 ++-- src/transcriptome.hpp | 5 ++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 1d180231ded..2a5c1d8153a 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -23,6 +23,7 @@ void help_rna(char** argv) { cerr << "\nusage: " << argv[0] << " rna [options] > splice_graph.vg" << endl << "options:" << endl << " -n, --transcripts FILE transcript file(s) in gtf/gff format; may repeat (required)" << endl + << " -y, --feature-type NAME parse only this feature type in the gtf/gff (parse all if empty) [exon]" << endl << " -s, --transcript-tag NAME use this attribute tag in the gtf/gff file(s) as id [transcript_id]" << endl << " -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file" << endl << " -e, --use-embedded-paths project transcripts onto embedded graph paths" << endl @@ -50,6 +51,7 @@ int32_t main_rna(int32_t argc, char** argv) { } vector transcript_filenames; + string feature_type = "exon"; string transcript_tag = "transcript_id"; string haplotypes_filename; bool use_embedded_paths = false; @@ -73,6 +75,7 @@ int32_t main_rna(int32_t argc, char** argv) { static struct option long_options[] = { {"transcripts", no_argument, 0, 'n'}, + {"feature-type", no_argument, 0, 'y'}, {"transcript-tag", no_argument, 0, 's'}, {"haplotypes", no_argument, 0, 'l'}, {"use-embeded-paths", no_argument, 0, 'e'}, @@ -93,7 +96,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t option_index = 0; - c = getopt_long(argc, argv, "n:s:l:ercdoraub:gf:i:t:ph?", long_options, &option_index); + c = getopt_long(argc, argv, "n:y:s:l:ercdoraub:gf:i:t:ph?", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) @@ -106,6 +109,10 @@ int32_t main_rna(int32_t argc, char** argv) { transcript_filenames.push_back(optarg); break; + case 'y': + feature_type = optarg; + break; + case 's': transcript_tag = optarg; break; @@ -219,6 +226,7 @@ int32_t main_rna(int32_t argc, char** argv) { if (show_progress) { cerr << "[vg rna] Graph (and index) parsed in " << gcsa::readTimer() - time_parsing_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; transcriptome.num_threads = num_threads; + transcriptome.feature_type = feature_type; transcriptome.transcript_tag = transcript_tag; transcriptome.use_embedded_paths = use_embedded_paths; transcriptome.use_reference_paths = (add_reference_transcript_paths || output_reference_transcript_paths); diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 7c7e4866677..d6f2edd9244 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -91,8 +91,8 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW transcript_stream.ignore(numeric_limits::max(), '\t'); getline(transcript_stream, feature, '\t'); - // Skip all non exon features, such as cds, gene etc. - if (feature != "exon") { + // Select only relevant feature types. 
+        if (feature != feature_type && !feature_type.empty()) {
 
             transcript_stream.ignore(numeric_limits<streamsize>::max(), '\n');
             continue;
diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp
index 2ace89c60af..8d4a749f5be 100644
--- a/src/transcriptome.hpp
+++ b/src/transcriptome.hpp
@@ -87,8 +87,11 @@ class Transcriptome {
     /// Number of threads used for transcript path construction.
     int32_t num_threads = 1;
 
+    /// Feature type to parse in the gtf/gff file. Parse all types if empty.
+    string feature_type;
+
     /// Attribute tag used to parse the transcript id/name in the gtf/gff file.
-    string transcript_tag = "transcript_id";
+    string transcript_tag;
 
     /// Use all paths embedded in the graph for transcript path construction.
     bool use_embedded_paths = false;

From 026c43d89851f296a55b5facf287d118d0a40b28 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Wed, 27 Nov 2019 13:51:30 -0800
Subject: [PATCH 69/79] added gff3 support

---
 src/transcriptome.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index d6f2edd9244..6e49b573cd5 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -57,8 +57,11 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW
 
     smatch regex_id_match;
 
-    // Regex used to extract transcript name/id.
-    regex regex_id_exp(transcript_tag + "\\s{1}\"?([^\"]*)\"?");
+    // Regex used to extract transcript name/id from gtf file.
+    regex regex_id_exp_gtf(transcript_tag + "\\s{1}\"?([^\"]*)\"?;?");
+
+    // Regex used to extract transcript name/id from gff file.
+    regex regex_id_exp_gff(transcript_tag + "={1}([^;]*);?");
 
     while (transcript_stream.good()) {
 
@@ -121,8 +124,15 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW
 
         string transcript_id = "";
 
-        // Get transcript name/id from attribute column using regex.
-        if (std::regex_search(attributes, regex_id_match, regex_id_exp)) {
+        // Get transcript name/id from gtf attribute column using regex.
+        if (std::regex_search(attributes, regex_id_match, regex_id_exp_gtf)) {
 
             assert(regex_id_match.size() == 2);
             transcript_id = regex_id_match[1];
+        }
+
+        // Get transcript name/id from gff attribute column using regex.
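        // (Reader's aside with example attribute columns, assuming
        // transcript_tag = "transcript_id":
        //     GTF:  gene_id "ENSG1"; transcript_id "ENST1";
        //     GFF3: ID=ENST1;Parent=ENSG1;transcript_id=ENST1
        // regex_id_exp_gtf captures the optionally quoted value after a single
        // whitespace, regex_id_exp_gff the value between '=' and the next ';'.)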
+ if (std::regex_search(attributes, regex_id_match, regex_id_exp_gff)) { assert(regex_id_match.size() == 2); transcript_id = regex_id_match[1]; From 78d7e64187a1057921a4910ae1a30cb3be255670 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Wed, 27 Nov 2019 13:59:40 -0800 Subject: [PATCH 70/79] added error if no transcripts parsed --- src/transcriptome.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 6e49b573cd5..115ed93c7b9 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -163,9 +163,16 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW add_exon(&(transcripts.back()), make_pair(spos, epos), *chrom_path_index.second); } - reorder_exons(&transcripts.back()); + if (transcripts.empty()) { + + cerr << "[transcriptome] ERROR: No transcript where parsed (remember to set feature type \"-y\")" << endl; + exit(1); + } + delete chrom_path_index.second; + reorder_exons(&transcripts.back()); + #ifdef transcriptome_debug double time_parsing_2 = gcsa::readTimer(); cerr << "DEBUG parsing end: " << time_parsing_2 - time_parsing_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; From 15b1830a7d00abef4b2a1a6981553da1445041e0 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Wed, 27 Nov 2019 14:01:12 -0800 Subject: [PATCH 71/79] fixed typo --- src/transcriptome.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 115ed93c7b9..55854baa9ec 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -165,7 +165,7 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW if (transcripts.empty()) { - cerr << "[transcriptome] ERROR: No transcript where parsed (remember to set feature type \"-y\")" << endl; + cerr << "[transcriptome] ERROR: No transcripts parsed (remember to set feature type \"-y\")" << endl; exit(1); } From 1a8ae8247cf22bb4b9f7e16a36d3d8d0567a6e7f Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 3 Dec 2019 14:41:30 +0100 Subject: [PATCH 72/79] avoid compiler warnings with an updated bbhash --- .gitmodules | 4 ++-- deps/BBHash | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 1b80573d401..a4baa883612 100644 --- a/.gitmodules +++ b/.gitmodules @@ -122,6 +122,6 @@ [submodule "deps/mmmultimap"] path = deps/mmmultimap url = https://github.com/ekg/mmmultimap.git -[submodule "deps/BBHash"] +[submodule "vgteam_bbhash"] path = deps/BBHash - url = https://github.com/rizkg/BBHash.git + url = https://github.com/vgteam/BBHash.git diff --git a/deps/BBHash b/deps/BBHash index 88fba4e5014..36e4fe3eaee 160000 --- a/deps/BBHash +++ b/deps/BBHash @@ -1 +1 @@ -Subproject commit 88fba4e50149d2d05855df0994f668b0f82783f7 +Subproject commit 36e4fe3eaeef762c831c49cdc01f1a3a2c7a97a4 From 1f09169a0b6c4baf8ad8301f05cf42e08984daec Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 3 Dec 2019 14:52:28 +0100 Subject: [PATCH 73/79] implement BED-based kmer extraction in vg find --- src/subcommand/find_main.cpp | 41 ++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 0cdcc690589..a9bacfc4b15 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -9,6 +9,7 @@ #include "../stream_index.hpp" #include "../algorithms/sorted_id_ranges.hpp" #include 
"../algorithms/approx_path_distance.hpp" +#include "../kmer.hpp" #include #include @@ -39,6 +40,7 @@ void help_find(char** argv) { << " -E, --path-dag with -p or -R, gets any node in the partial order from pos1 to pos2, assumes id sorted DAG" << endl << " -W, --save-to PREFIX instead of writing target subgraphs to stdout," << endl << " write one per given target to a separate file named PREFIX[path]:[start]-[end].vg" << endl + << " -K, --subgraph-k K instead of graphs, write kmers from the subgraphs" << endl << "alignments:" << endl << " -d, --db-name DIR use this RocksDB database to retrieve alignments" << endl << " -l, --sorted-gam FILE use this sorted, indexed GAM file" << endl @@ -111,6 +113,7 @@ int main_find(int argc, char** argv) { bool path_dag = false; string bed_targets_file; string save_to_prefix; + int subgraph_k = 0; int c; optind = 2; // force optind past command positional argument @@ -153,11 +156,12 @@ int main_find(int argc, char** argv) { {"min-mem", required_argument, 0, 'Z'}, {"paths-named", required_argument, 0, 'Q'}, {"list-paths", no_argument, 0, 'I'}, + {"subgraph-k", required_argument, 0, 'K'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:l:amg:M:B:fDG:N:A:Y:Z:IQ:ER:W:", + c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:l:amg:M:B:fDG:N:A:Y:Z:IQ:ER:W:K:", long_options, &option_index); // Detect the end of the options. @@ -309,6 +313,10 @@ int main_find(int argc, char** argv) { to_graph_file = optarg; break; + case 'K': + subgraph_k = atoi(optarg); + break; + case 'h': case '?': help_find(argv); @@ -643,8 +651,37 @@ int main_find(int argc, char** argv) { VG empty; graph = empty; } + if (subgraph_k) { + // enumerate the kmers, calculating including their start positions relative to the reference + // and write to stdout? + for_each_kmer(graph, subgraph_k, + [&](const kmer_t& kmer) { + // get the reference-relative position + string start_str, end_str; + for (auto& p : algorithms::nearest_offsets_in_paths(xindex, kmer.begin, subgraph_k*2)) { + const uint64_t& start_p = p.second.front().first; + const bool& start_rev = p.second.front().second; + if (p.first == path_handle && (!start_rev && start_p >= target.start || start_rev && start_p <= target.end)) { + start_str = target.seq + ":" + std::to_string(start_p) + (p.second.front().second ? "-" : "+"); + } + } + for (auto& p : algorithms::nearest_offsets_in_paths(xindex, kmer.end, subgraph_k*2)) { + const uint64_t& end_p = p.second.front().first; + const bool& end_rev = p.second.front().second; + if (p.first == path_handle && (!end_rev && end_p <= target.end || end_rev && end_p >= target.start)) { + end_str = target.seq + ":" + std::to_string(end_p) + (p.second.front().second ? 
"-" : "+"); + } + } + if (!start_str.empty() && !end_str.empty()) { + // write our record +#pragma omp critical (cout) + cout << target.seq << ":" << target.start << "-" << target.end << "\t" + << kmer.seq << "\t" << start_str << "\t" << end_str << std::endl; + } + }); + } } - if (save_to_prefix.empty()) { + if (save_to_prefix.empty() && !subgraph_k) { prep_graph(); graph.serialize_to_ostream(cout); } From 48e7dca839bcb105923c61d1f8764a8f691ef939 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 3 Dec 2019 14:56:18 +0100 Subject: [PATCH 74/79] add a basic test for the new BED based kmer extraction --- test/t/05_vg_find.t | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index 3b0550a6c89..a7fc6d9dfbf 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 27 +plan tests 28 vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg is $? 0 "construction" @@ -107,6 +107,8 @@ is $((vg view t.x:30:35.vg; vg view t.x:10:20.vg) | wc -l) 20 "we can extract a echo x 30 36 | tr ' ' '\t' >t.bed echo x 10 21 | tr ' ' '\t' >>t.bed vg find -x t.xg -E -R t.bed -W q. -is $((vg view q.x:10:20.vg; vg view q.x:30:35.vg) | md5sum | cut -f 1 -d\ ) $((vg view t.x:10:20.vg ; vg view t.x:30:35.vg)| md5sum | cut -f 1 -d\ ) "the same extraction can be made using BEd input" +is $((vg view q.x:10:20.vg; vg view q.x:30:35.vg) | md5sum | cut -f 1 -d\ ) $((vg view t.x:10:20.vg ; vg view t.x:30:35.vg)| md5sum | cut -f 1 -d\ ) "the same extraction can be made using BED input" + +is $(vg find -x t.xg -E -p x:30-35 -p x:10-20 -K 5 | wc -l) 22 "we see the expected number of kmers in the given targets" rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed From d7bdb0d0014a52c7d5c93788ae79a497e623f294 Mon Sep 17 00:00:00 2001 From: jonassibbesen Date: Tue, 3 Dec 2019 11:35:20 -0800 Subject: [PATCH 75/79] handlifyed vg rna --- src/io/register_loader_saver_hash_graph.cpp | 2 +- src/io/register_loader_saver_odgi.cpp | 2 +- src/io/register_loader_saver_packed_graph.cpp | 2 +- src/io/register_loader_saver_vg.cpp | 2 +- src/path_index.cpp | 10 +- src/subcommand/rna_main.cpp | 8 +- src/transcriptome.cpp | 294 +++++++----------- src/transcriptome.hpp | 11 +- 8 files changed, 124 insertions(+), 207 deletions(-) diff --git a/src/io/register_loader_saver_hash_graph.cpp b/src/io/register_loader_saver_hash_graph.cpp index f2d4b1fdc10..fb8db9ad538 100644 --- a/src/io/register_loader_saver_hash_graph.cpp +++ b/src/io/register_loader_saver_hash_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_hash_graph() { - Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { // Allocate a HashGraph bdsg::HashGraph* hash_graph = new bdsg::HashGraph(); diff --git a/src/io/register_loader_saver_odgi.cpp b/src/io/register_loader_saver_odgi.cpp index 0bf21f22c86..97a0d3d1c6f 100644 --- a/src/io/register_loader_saver_odgi.cpp +++ b/src/io/register_loader_saver_odgi.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_odgi() { - Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { // Allocate a PackedGraph bdsg::ODGI* odgi = new bdsg::ODGI(); diff --git 
a/src/io/register_loader_saver_packed_graph.cpp b/src/io/register_loader_saver_packed_graph.cpp index 510ead17723..4a7baeac7ca 100644 --- a/src/io/register_loader_saver_packed_graph.cpp +++ b/src/io/register_loader_saver_packed_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_packed_graph() { - Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { // Allocate a PackedGraph bdsg::PackedGraph* packed_graph = new bdsg::PackedGraph(); diff --git a/src/io/register_loader_saver_vg.cpp b/src/io/register_loader_saver_vg.cpp index a1554008df8..41630f5c2d2 100644 --- a/src/io/register_loader_saver_vg.cpp +++ b/src/io/register_loader_saver_vg.cpp @@ -17,7 +17,7 @@ using namespace vg::io; void register_loader_saver_vg() { // We register for "" so we can handle untagged old-style vg files and make them into HandleGraphs - Registry::register_loader_saver(vector{"VG", ""}, + Registry::register_loader_saver(vector{"VG", ""}, [](const message_sender_function_t& for_each_message) -> void* { // We have a bit of a control problem. // The source function wants to drive; we give it a function of strings, and it calls it with all the strings in turn. diff --git a/src/path_index.cpp b/src/path_index.cpp index e5ed83e3ca6..8e215a22e4c 100644 --- a/src/path_index.cpp +++ b/src/path_index.cpp @@ -267,14 +267,8 @@ PathIndex::PathIndex(const PathHandleGraph& graph, const string& path_name, bool assert(graph.has_path(path_name)); // Make a Protobuf path object - Path path; - for (handle_t handle : graph.scan_path(graph.get_path_handle(path_name))) { - Mapping* mapping = path.add_mapping(); - Position* position = mapping->mutable_position(); - position->set_node_id(graph.get_id(handle)); - position->set_is_reverse(graph.get_is_reverse(handle)); - } - + auto path = path_from_path_handle(graph, graph.get_path_handle(path_name)); + if (extract_sequence) { // Constructor dispatch hack *this = PathIndex(path, graph); diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 2a5c1d8153a..596d2bcb66b 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -28,7 +28,7 @@ void help_rna(char** argv) { << " -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file" << endl << " -e, --use-embedded-paths project transcripts onto embedded graph paths" << endl << " -c, --do-not-collapse do not collapse identical transcripts across haplotypes" << endl - << " -d, --remove-non-gene remove intergenic and intronic regions (removes reference paths if -a or -r)" << endl + << " -d, --remove-non-gene remove intergenic and intronic regions (removes reference paths)" << endl << " -o, --do-not-sort do not topological sort and compact splice graph" << endl << " -r, --add-ref-paths add reference transcripts as embedded paths in the splice graph" << endl << " -a, --add-non-ref-paths add non-reference transcripts as embedded paths in the splice graph" << endl @@ -201,6 +201,10 @@ int32_t main_rna(int32_t argc, char** argv) { return 1; } + if (remove_non_transcribed && !add_reference_transcript_paths && !add_non_reference_transcript_paths) { + + cerr << "[vg rna] WARNING: No haplotypes or paths were given for transcript projection. Use --haplotypes FILE and/or --use-embeded-paths." << endl; + } double time_parsing_start = gcsa::readTimer(); if (show_progress) { cerr << "[vg rna] Parsing graph file ..." 
<< endl; } @@ -287,7 +291,7 @@ int32_t main_rna(int32_t argc, char** argv) { if (show_progress) { cerr << "[vg rna] Adding " << ((add_reference_transcript_paths) ? "reference" : "non-reference") << " transcript paths to splice graph ..." << endl; } } - transcriptome.embed_transcript_paths(add_reference_transcript_paths, add_non_reference_transcript_paths, false); + transcriptome.embed_transcript_paths(add_reference_transcript_paths, add_non_reference_transcript_paths); if (show_progress) { cerr << "[vg rna] Paths added in " << gcsa::readTimer() - time_add_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; } diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 55854baa9ec..1553900890c 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -3,22 +3,25 @@ #include "../algorithms/topological_sort.hpp" #include "../algorithms/apply_bulk_modifications.hpp" +#include "../io/save_handle_graph.hpp" #include "transcriptome.hpp" #include "../gbwt_helper.hpp" +#include "../augment.hpp" +#include "../utility.hpp" namespace vg { using namespace std; -// #define transcriptome_debug +#define transcriptome_debug Transcriptome::Transcriptome(const string & graph_filename, const bool show_progress) { // Load variation graph. get_input_file(graph_filename, [&](istream& in) { - _splice_graph = new VG(in, show_progress); + _splice_graph = vg::io::VPKG::load_one(in); }); if (!_splice_graph) { @@ -27,11 +30,6 @@ Transcriptome::Transcriptome(const string & graph_filename, const bool show_prog } } -Transcriptome::~Transcriptome() { - - delete _splice_graph; -} - void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBWT & haplotype_index) { #ifdef transcriptome_debug @@ -42,7 +40,14 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW vector transcripts; // Get mean length of nodes in the graph. - const float mean_node_length = _splice_graph->length() / static_cast(_splice_graph->size()); + double total_node_length = 0; + assert(_splice_graph->for_each_handle([&](const handle_t & handle) { + + total_node_length += _splice_graph->get_length(handle); + })); + + const float mean_node_length = total_node_length / _splice_graph->get_node_count(); + pair chrom_path_index("", nullptr); int32_t line_number = 0; @@ -75,7 +80,7 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW continue; } - if (!_splice_graph->paths.has_path(chrom)) { + if (!_splice_graph->has_path(chrom)) { cerr << "[transcriptome] ERROR: Chromomsome path \"" << chrom << "\" not found in graph (line " << line_number << ")." << endl; exit(1); @@ -86,7 +91,7 @@ void Transcriptome::add_transcripts(istream & transcript_stream, const gbwt::GBW chrom_path_index.first = chrom; // Construct path index for chromosome/contig. 
-            chrom_path_index.second = new PathIndex(*_splice_graph, chrom);
+            chrom_path_index.second = new PathIndex(*_splice_graph, chrom_path_index.first);
         }
 
         assert(chrom_path_index.second);
@@ -452,7 +457,7 @@ list<TranscriptPath> Transcriptome::project_transcript_gbwt(const Transcript & c
             for (auto & exon_node: haplotype.first.at(exon_idx)) {
 
                 auto node_id = gbwt::Node::id(exon_node);
-                auto node_length = _splice_graph->get_node(node_id)->sequence().size();
+                auto node_length = _splice_graph->get_length(_splice_graph->get_handle(node_id));
 
                 int32_t offset = 0;
@@ -511,7 +516,7 @@ list<TranscriptPath> Transcriptome::project_transcript_gbwt(const Transcript & c
         if (cur_transcript.is_reverse) {
 
             // Reverse complement transcript paths that are on the '-' strand.
-            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_node(node_id)->sequence().size();});
+            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_length(_splice_graph->get_handle(node_id));});
         }
 
         // Copy paths if collapse of identical transcript paths is not wanted.
@@ -635,38 +640,45 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
 
 list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript & cur_transcript) const {
 
-    vector<map<int64_t, set<mapping_t *> > *> exon_start_node_mappings;
-    vector<map<int64_t, set<mapping_t *> > *> exon_end_node_mappings;
+    vector<unordered_map<path_handle_t, step_handle_t> > exon_start_node_path_steps;
+    vector<unordered_map<path_handle_t, step_handle_t> > exon_end_node_path_steps;
 
-    exon_start_node_mappings.reserve(cur_transcript.exon_border_nodes.size());
-    exon_end_node_mappings.reserve(cur_transcript.exon_border_nodes.size());
+    exon_start_node_path_steps.reserve(cur_transcript.exon_border_nodes.size());
+    exon_end_node_path_steps.reserve(cur_transcript.exon_border_nodes.size());
 
     // Get embedded path ids and node mappings for all exon border nodes in transcript.
     for (auto & exon_node: cur_transcript.exon_border_nodes) {
 
-        exon_start_node_mappings.emplace_back(&_splice_graph->paths.get_node_mapping(exon_node.first.node_id()));
-        exon_end_node_mappings.emplace_back(&_splice_graph->paths.get_node_mapping(exon_node.second.node_id()));
+        exon_start_node_path_steps.emplace_back(unordered_map<path_handle_t, step_handle_t>());
+        _splice_graph->for_each_step_on_handle(_splice_graph->get_handle(exon_node.first.node_id()), [&](const step_handle_t & step) {
+            assert(exon_start_node_path_steps.back().emplace(_splice_graph->get_path_handle_of_step(step), step).second);
+        });
+
+        exon_end_node_path_steps.emplace_back(unordered_map<path_handle_t, step_handle_t>());
+        _splice_graph->for_each_step_on_handle(_splice_graph->get_handle(exon_node.second.node_id()), [&](const step_handle_t & step) {
+            assert(exon_end_node_path_steps.back().emplace(_splice_graph->get_path_handle_of_step(step), step).second);
+        });
     }
 
     list<TranscriptPath> cur_transcript_paths;
 
     // Loop over all paths that contain the transcript start node.
-    for (auto & path_mapping_start: *exon_start_node_mappings.front()) {
+    for (auto & path_steps_start: exon_start_node_path_steps.front()) {
 
         // Skip path if transcript end node is not in the current path.
-        if (exon_end_node_mappings.back()->find(path_mapping_start.first) == exon_end_node_mappings.back()->end()) {
+        if (exon_end_node_path_steps.back().find(path_steps_start.first) == exon_end_node_path_steps.back().end()) {
 
             continue;
         }
 
+        const auto path_origin_name = _splice_graph->get_path_name(path_steps_start.first);
+
         // Skip alternative allele paths (_alt).
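        // (Reader's aside: Paths::is_alt is a name-prefix test; graphs built with
        // vg construct -a carry one embedded path per variant allele, named like
        // "_alt_<variant hash>_<allele number>", and those must not be mistaken
        // for reference or haplotype paths when projecting transcripts.)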
- if (Paths::is_alt(_splice_graph->paths.get_path_name(path_mapping_start.first))) { + if (Paths::is_alt(path_origin_name)) { continue; } - const auto path_origin_name = _splice_graph->paths.get_path_name(path_mapping_start.first); - // Only construct transcript paths originating from a reference chromosome/contig. if (path_origin_name != cur_transcript.chrom && !use_embedded_paths && use_reference_paths) { @@ -688,59 +700,43 @@ list Transcriptome::project_transcript_embedded(const Transcript bool is_partial = false; - mapping_t * haplotype_path_start_map = nullptr; - mapping_t * haplotype_path_end_map = nullptr; - - for (size_t exon_idx = 0; exon_idx < exon_start_node_mappings.size(); ++exon_idx) { - - auto haplotype_path_start_it = exon_start_node_mappings.at(exon_idx)->find(path_mapping_start.first); - auto haplotype_path_end_it = exon_end_node_mappings.at(exon_idx)->find(path_mapping_start.first); + for (size_t exon_idx = 0; exon_idx < exon_start_node_path_steps.size(); ++exon_idx) { - // Get path mapping at exon start if exon start node is in the current path. - if (haplotype_path_start_it != exon_start_node_mappings.at(exon_idx)->end()) { - - assert(haplotype_path_start_it->second.size() == 1); - haplotype_path_start_map = *haplotype_path_start_it->second.begin(); - } - - // Get path mapping at exon end if exon end node is in the current path. - if (haplotype_path_end_it != exon_end_node_mappings.at(exon_idx)->end()) { - - assert(haplotype_path_end_it->second.size() == 1); - haplotype_path_end_map = *haplotype_path_end_it->second.begin(); - } + auto haplotype_path_start_it = exon_start_node_path_steps.at(exon_idx).find(path_steps_start.first); + auto haplotype_path_end_it = exon_end_node_path_steps.at(exon_idx).find(path_steps_start.first); // Transcript paths are partial if either the start or end exon path - // mapping is empty. Partial transcripts are currently not supported. + // step is empty. Partial transcripts are currently not supported. // TODO: Add support for partial transcript paths. - if (!haplotype_path_start_map || !haplotype_path_end_map) { + if ((haplotype_path_start_it == exon_start_node_path_steps.at(exon_idx).end()) || haplotype_path_end_it == exon_end_node_path_steps.at(exon_idx).end()) { is_partial = true; break; } - bool is_first_mapping = true; + // Get path step at exon start if exon start node is in the current path. + auto haplotype_path_start_step = haplotype_path_start_it->second; - while (true) { + // Get path mapping at exon end if exon end node is in the current path. + auto haplotype_path_end_step = haplotype_path_end_it->second; - auto cur_node_id = haplotype_path_start_map->node_id(); - auto node_length = _splice_graph->get_node(cur_node_id)->sequence().size(); - assert(node_length == haplotype_path_start_map->length); + bool is_first_step = true; + while (true) { + + auto node_length = _splice_graph->get_length(_splice_graph->get_handle_of_step(haplotype_path_start_step)); int32_t offset = 0; // Adjust start position from exon border (last position in upstream intron) // to first position in exon. 
-                if (is_first_mapping) {
+                if (is_first_step) {
 
                     if (cur_transcript.exon_border_nodes.at(exon_idx).first.offset() + 1 == node_length) {
 
-                        assert(haplotype_path_start_map != haplotype_path_end_map);
-
-                        haplotype_path_start_map = _splice_graph->paths.traverse_right(haplotype_path_start_map);
-                        assert(haplotype_path_start_map);
+                        assert(haplotype_path_start_step != haplotype_path_end_step);
+                        haplotype_path_start_step = _splice_graph->get_next_step(haplotype_path_start_step);
+                        is_first_step = false;
 
-                        is_first_mapping = false;
                         continue;
 
                     } else {
 
@@ -753,7 +749,7 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
 
                 // Adjust end position from exon border (first position in downstream intron)
                 // to last position in exon.
-                if (haplotype_path_start_map == haplotype_path_end_map) {
+                if (haplotype_path_start_step == haplotype_path_end_step) {
 
                     if (cur_transcript.exon_border_nodes.at(exon_idx).second.offset() == 0) {
 
@@ -773,7 +769,7 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
 
                 auto new_mapping = cur_transcript_paths.back().path.add_mapping();
                 new_mapping->set_rank(cur_transcript_paths.back().path.mapping_size());
 
-                new_mapping->mutable_position()->set_node_id(cur_node_id);
+                new_mapping->mutable_position()->set_node_id(_splice_graph->get_id(_splice_graph->get_handle_of_step(haplotype_path_start_step)));
                 new_mapping->mutable_position()->set_offset(offset);
                 new_mapping->mutable_position()->set_is_reverse(false);
 
@@ -782,16 +778,11 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
                 new_edit->set_from_length(edit_length);
                 new_edit->set_to_length(edit_length);
 
-                if (haplotype_path_start_map == haplotype_path_end_map) { break; }
+                if (haplotype_path_start_step == haplotype_path_end_step) { break; }
 
-                haplotype_path_start_map = _splice_graph->paths.traverse_right(haplotype_path_start_map);
-                assert(haplotype_path_start_map);
-
-                is_first_mapping = false;
+                haplotype_path_start_step = _splice_graph->get_next_step(haplotype_path_start_step);
+                is_first_step = false;
             }
-
-            haplotype_path_start_map = nullptr;
-            haplotype_path_end_map = nullptr;
         }
 
         if (is_partial) {
@@ -806,7 +797,7 @@ list<TranscriptPath> Transcriptome::project_transcript_embedded(const Transcript
 
         // Reverse complement transcript paths that are on the '-' strand.
         if (cur_transcript.is_reverse) {
 
-            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_node(node_id)->sequence().size();});
+            reverse_complement_path_in_place(&(cur_transcript_paths.back().path), [&](size_t node_id) {return _splice_graph->get_length(_splice_graph->get_handle(node_id));});
         }
     }
 }
@@ -871,7 +862,7 @@ bool Transcriptome::add_novel_transcript_junctions(const list<TranscriptPath> &
 
         auto & cur_mapping = transcript_path.path.mapping(i);
 
-        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || !_splice_graph->has_node(cur_mapping.position().node_id()) || _splice_graph->get_node(cur_mapping.position().node_id())->sequence().size() != cur_mapping.edit(0).from_length()) {
+        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || !_splice_graph->has_node(cur_mapping.position().node_id()) || _splice_graph->get_length(_splice_graph->get_handle(cur_mapping.position().node_id())) != cur_mapping.edit(0).from_length()) {
 
             all_junctions_added = false;
             i++;
 
@@ -882,12 +873,12 @@ bool Transcriptome::add_novel_transcript_junctions(const list<TranscriptPath> &
 
             auto & prev_mapping = transcript_path.path.mapping(i - 1);
 
-            auto prev_node_side = NodeSide(prev_mapping.position().node_id(), (prev_mapping.position().is_reverse() ? false : true));
-            auto cur_node_side = NodeSide(cur_mapping.position().node_id(), (cur_mapping.position().is_reverse() ? true : false));
+            auto prev_handle = _splice_graph->get_handle(prev_mapping.position().node_id(), prev_mapping.position().is_reverse());
+            auto cur_handle = _splice_graph->get_handle(cur_mapping.position().node_id(), cur_mapping.position().is_reverse());
 
-            if (!_splice_graph->has_edge(prev_node_side, cur_node_side)) {
+            if (!_splice_graph->has_edge(prev_handle, cur_handle)) {
 
-                _splice_graph->create_edge(prev_node_side, cur_node_side);
+                _splice_graph->create_edge(prev_handle, cur_handle);
             }
         }
     }
@@ -930,9 +921,11 @@ void Transcriptome::add_paths_to_transcriptome(list<TranscriptPath> * new_transc
     cerr << "DEBUG edit start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl;
 #endif
 
+    stringstream gam_out_stream;
+
     // Edit splice graph with projected transcript paths and
     // update path traversals to match the augmented graph.
-    _splice_graph->edit(edit_paths, nullptr, false, true, true);
+    augment(static_cast<MutablePathMutableHandleGraph *>(_splice_graph.get()), edit_paths, nullptr, &gam_out_stream, false, true);
 
 #ifdef transcriptome_debug
     double time_edit_2 = gcsa::readTimer();
 #endif
 
     // Update projected transcript paths with new path traversals.
-    assert(edit_paths.size() == new_transcript_paths->size());
     auto new_transcript_paths_it = new_transcript_paths->begin();
-    auto edit_paths_it = edit_paths.begin();
-
-    while (new_transcript_paths_it != new_transcript_paths->end()) {
-
-        new_transcript_paths_it->path = move(*edit_paths_it);
+
+    vg::io::for_each(gam_out_stream, [&](vg::Alignment & alignment) {
+
+        new_transcript_paths_it->path = move(alignment.path());
         ++new_transcript_paths_it;
-        ++edit_paths_it;
-    }
+    });
 
-    assert(edit_paths_it == edit_paths.end());
+    assert(new_transcript_paths_it == new_transcript_paths->end());
 }
 
 _transcript_paths.reserve(_transcript_paths.size() + new_transcript_paths->size());
@@ -975,26 +963,21 @@ int32_t Transcriptome::size() const {
 
     return _transcript_paths.size();
 }
 
-const VG & Transcriptome::splice_graph() const {
+const MutablePathDeletableHandleGraph & Transcriptome::splice_graph() const {
 
     return *_splice_graph;
 }
 
 void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
 
-    // Save copy of embedded reference paths
-    Paths reference_paths;
-    if (new_reference_paths) {
-
-        reference_paths = _splice_graph->paths;
-    }
-
     // Remove non transcript paths.
-    _splice_graph->clear_paths();
+    assert(_splice_graph->for_each_path_handle([&](const path_handle_t & path_handle) {
+
+        _splice_graph->destroy_path(path_handle);
+    }));
 
-    // Find all nodes and edges that are in a transcript path.
+    // Find all nodes that are in a transcript path.
     unordered_set<vg::id_t> transcribed_nodes;
-    unordered_set<pair<vg::id_t, vg::id_t> > transcribed_edges;
 
     for (auto & transcript_path: _transcript_paths) {
 
@@ -1004,125 +987,61 @@ void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
         for (size_t i = 1; i < transcript_path.path.mapping_size(); i++) {
 
             transcribed_nodes.emplace(transcript_path.path.mapping(i).position().node_id());
-            transcribed_edges.emplace(transcript_path.path.mapping(i-1).position().node_id(), transcript_path.path.mapping(i).position().node_id());
         }
     }
 
-    // Find all nodes that are not in a transcript path.
+    // Delete all nodes that are not in a transcript path.
     vector<vg::id_t> non_transcribed_nodes;
 
-    _splice_graph->for_each_node([&](const Node * node) {
+    assert(_splice_graph->for_each_handle([&](const handle_t & handle) {
 
-        if (transcribed_nodes.count(node->id()) == 0) {
+        if (transcribed_nodes.count(_splice_graph->get_id(handle)) == 0) {
 
-            non_transcribed_nodes.emplace_back(node->id());
+            _splice_graph->destroy_handle(handle);
         }
-    });
-
-    for (auto & node: non_transcribed_nodes) {
-
-        // Delete node and in/out edges.
-        _splice_graph->destroy_node(node);
-    }
-
-    // Create new reference paths that only include trancribed nodes and edges.
-    if (new_reference_paths) {
-
-        reference_paths.for_each([&](const Path & path) {
-
-            if (!Paths::is_alt(path.name())) {
-
-                vector<Path> new_paths;
-
-                new_paths.emplace_back(Path());
-                new_paths.back().set_name(path.name() + "_" + to_string(new_paths.size() - 1));
-
-                for (auto & mapping: path.mapping()) {
-
-                    auto cur_node_id = mapping.position().node_id();
-
-                    if (new_paths.back().mapping_size() == 0) {
-
-                        if (transcribed_nodes.count(cur_node_id) > 0) {
-
-                            auto new_mapping = new_paths.back().add_mapping();
-                            *new_mapping = mapping;
-                            new_mapping->set_rank(new_paths.back().mapping_size());
-                        }
-
-                    } else {
-
-                        auto prev_node_id = new_paths.back().mapping(new_paths.back().mapping_size() - 1).position().node_id();
-
-                        // Extend new path, if transcribed edge (forward or reverse) exist between
-                        // this and the previous node in the path.
-                        if (transcribed_edges.count(make_pair(prev_node_id, cur_node_id)) > 0 || transcribed_edges.count(make_pair(cur_node_id, prev_node_id)) > 0) {
-
-                            auto new_mapping = new_paths.back().add_mapping();
-                            *new_mapping = mapping;
-                            new_mapping->set_rank(new_paths.back().mapping_size());
-
-                        } else {
-
-                            new_paths.emplace_back(Path());
-                            new_paths.back().set_name(path.name() + "_" + to_string(new_paths.size() - 1));
-                        }
-                    }
-                }
-
-                // Add new reference paths to graph without rebuild paths indexes.
-                _splice_graph->paths.extend(new_paths, false, false);
-            }
-        });
-
-        // Rebuild paths indexes.
-        _splice_graph->paths.compact_ranks();
-    }
 }
 
 void Transcriptome::compact_ordered() {
 
     // Find and apply topological ordering
-    auto topological_ordering = algorithms::topological_order(_splice_graph);
-    _splice_graph->apply_ordering(topological_ordering);
+    _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), false);
 
-    // Compact node ids and update embedded paths.
-    hash_map<vg::id_t, vg::id_t> compacted_nodes;
-    _splice_graph->compact_ids(compacted_nodes);
+    // TODO: Compact nodes for other graph types
+    VG * vg_splice_graph = dynamic_cast<VG *>(_splice_graph.get());
+    if (vg_splice_graph != nullptr) {
 
-    // Update transcript paths with compacted node ids
-    for (auto & transcript_path: _transcript_paths) {
-
-        for (auto & mapping: *transcript_path.path.mutable_mapping()) {
+        // Compact node ids and update embedded paths
+        hash_map<vg::id_t, vg::id_t> compacted_nodes;
+        vg_splice_graph->compact_ids(compacted_nodes);
 
-            mapping.mutable_position()->set_node_id(compacted_nodes.at(mapping.position().node_id()));
+        // Update transcript paths with compacted node ids
+        for (auto & transcript_path: _transcript_paths) {
+
+            for (auto & mapping: *transcript_path.path.mutable_mapping()) {
+
+                mapping.mutable_position()->set_node_id(compacted_nodes.at(mapping.position().node_id()));
+            }
         }
     }
 }
 
-void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths, const bool rebuild_indexes) {
+void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths) {
 
     // Add transcript paths to graph
     for (auto & transcript_path: _transcript_paths) {
 
         assert(!transcript_path.haplotype_origins.empty() || !transcript_path.reference_origin.empty());
-        transcript_path.path.set_name(transcript_path.name);
+        auto path_handle = _splice_graph->create_path_handle(transcript_path.name);
 
-        if (add_reference_paths && !transcript_path.reference_origin.empty()) {
-
-            _splice_graph->paths.extend(transcript_path.path, false, false);
-
-        } else if (add_non_reference_paths && !transcript_path.haplotype_origins.empty()) {
+        if ((add_reference_paths && !transcript_path.reference_origin.empty()) || (add_non_reference_paths && !transcript_path.haplotype_origins.empty())) {
 
-            _splice_graph->paths.extend(transcript_path.path, false, false);
-        }
+            for (auto & mapping: transcript_path.path.mapping()) {
 
-        transcript_path.path.set_name("");
-    }
-
-    // Rebuild paths indexes.
-    if (rebuild_indexes) {
+                auto handle = _splice_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse());
+                _splice_graph->append_step(path_handle, handle);
+            }
 
-        _splice_graph->paths.compact_ranks();
+        }
     }
 }
 
@@ -1187,8 +1106,7 @@ void Transcriptome::write_sequences(ostream * fasta_ostream, const bool output_r
 
     if (!transcript_path.haplotype_origins.empty() || output_reference_transcripts) {
 
         // Write transcript path name and sequence.
-        *fasta_ostream << ">" << transcript_path.name << endl;
-        *fasta_ostream << _splice_graph->path_sequence(transcript_path.path) << endl;
+        write_fasta_sequence(transcript_path.name, path_sequence(*_splice_graph, transcript_path.path), cout);
     }
 }
 }
@@ -1240,7 +1158,7 @@ void Transcriptome::write_info(ostream * tsv_ostream, const bool output_referenc
 
 void Transcriptome::write_splice_graph(ostream * vg_ostream) {
 
-    _splice_graph->serialize_to_ostream(*vg_ostream);
+    vg::io::save_handle_graph(_splice_graph.get(), *vg_ostream);
 }
 
 }
diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp
index 8d4a749f5be..f9fa8effa77 100644
--- a/src/transcriptome.hpp
+++ b/src/transcriptome.hpp
@@ -7,6 +7,9 @@
 #include 
 #include 
 
+#include 
+#include 
+#include 
 
 #include "../vg.hpp"
 #include "../path_index.hpp"
@@ -82,7 +85,6 @@ class Transcriptome {
 
     public:
 
     Transcriptome(const string &, const bool);
-    ~Transcriptome();
 
     /// Number of threads used for transcript path construction.
     int32_t num_threads = 1;
@@ -113,7 +115,7 @@ class Transcriptome {
 
     int32_t size() const;
 
     /// Returns spliced variation graph.
-    const VG & splice_graph() const;
+    const MutablePathDeletableHandleGraph & splice_graph() const;
 
     /// Removes non-transcribed (not in transcript paths) nodes.
     /// Optionally create new reference paths that only include
@@ -124,8 +126,7 @@ class Transcriptome {
 
     void compact_ordered();
 
     /// Embeds transcript paths in spliced variation graph.
-    /// Optionally rebuild paths indexes.
-    void embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths, const bool rebuild_indexes);
+    void embed_transcript_paths(const bool add_reference_paths, const bool add_non_reference_paths);
 
     /// Add transcript paths as threads in GBWT index.
     void construct_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool output_reference_transcripts, const bool add_bidirectional) const;
@@ -148,7 +149,7 @@ class Transcriptome {
 
     vector<TranscriptPath> _transcript_paths;
 
     /// Spliced variation graph.
-    VG * _splice_graph;
+    unique_ptr<MutablePathDeletableHandleGraph> _splice_graph;
 
     /// Finds the position of each end of a exon on a path in the
     /// variation graph and adds the exon to a transcript.
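
Note on the API migration in the patch above: vg's Paths index calls
(paths.get_node_mapping(), traverse_right(), raw mapping_t pointers) are
replaced with libhandlegraph's step API. The following is a minimal sketch of
the new lookup idiom only, not part of the patch series; `steps_on_node` is an
illustrative helper name rather than a vg function, and it assumes the
std::hash specialization for path_handle_t provided by handlegraph/util.hpp:

    #include <unordered_map>

    #include <handlegraph/path_handle_graph.hpp>
    #include <handlegraph/util.hpp>

    using namespace std;
    using namespace handlegraph;

    // Collect one step per embedded path visiting a node, mirroring how
    // project_transcript_embedded() builds exon_start_node_path_steps above.
    unordered_map<path_handle_t, step_handle_t> steps_on_node(const PathHandleGraph & graph, const nid_t node_id) {

        unordered_map<path_handle_t, step_handle_t> steps;

        graph.for_each_step_on_handle(graph.get_handle(node_id), [&](const step_handle_t & step) {

            // emplace() silently keeps the first step if a path visits the
            // node twice; the patch instead asserts that this cannot happen.
            steps.emplace(graph.get_path_handle_of_step(step), step);
        });

        return steps;
    }

Walking a path from such a step then uses get_next_step() and
get_handle_of_step(), which is how the patch replaces Paths::traverse_right()
and per-mapping node lookups.
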
From 930e335a5179cc64705c0303cf2e7706496508c6 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 13:48:18 -0800
Subject: [PATCH 76/79] fix warning

---
 src/subcommand/rna_main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp
index 596d2bcb66b..529db98ea7e 100644
--- a/src/subcommand/rna_main.cpp
+++ b/src/subcommand/rna_main.cpp
@@ -28,7 +28,7 @@ void help_rna(char** argv) {
         << "    -l, --haplotypes FILE      project transcripts onto haplotypes in GBWT index file" << endl
         << "    -e, --use-embedded-paths   project transcripts onto embedded graph paths" << endl
         << "    -c, --do-not-collapse      do not collapse identical transcripts across haplotypes" << endl
-        << "    -d, --remove-non-gene      remove intergenic and intronic regions (removes reference paths)" << endl
+        << "    -d, --remove-non-gene      remove intergenic and intronic regions (deletes reference paths)" << endl
         << "    -o, --do-not-sort          do not topological sort and compact splice graph" << endl
         << "    -r, --add-ref-paths        add reference transcripts as embedded paths in the splice graph" << endl
         << "    -a, --add-non-ref-paths    add non-reference transcripts as embedded paths in the splice graph" << endl
@@ -203,7 +203,7 @@ int32_t main_rna(int32_t argc, char** argv) {
 
     if (remove_non_transcribed && !add_reference_transcript_paths && !add_non_reference_transcript_paths) {
 
-        cerr << "[vg rna] WARNING: No haplotypes or paths were given for transcript projection. Use --haplotypes FILE and/or --use-embeded-paths." << endl;
+        cerr << "[vg rna] WARNING: Reference paths are deleted when removing intergenic and intronic regions. Consider adding transcripts as embedded paths using --add-ref-paths and/or --add-non-ref-paths." << endl;
     }
 
     double time_parsing_start = gcsa::readTimer();

From 689a03b24da6352200d938232f4d3bf995390b31 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 15:14:21 -0800
Subject: [PATCH 77/79] compact ids for non-vg graphs

---
 src/transcriptome.cpp | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index 1553900890c..9258419f80c 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -1003,13 +1003,13 @@ void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
 
 void Transcriptome::compact_ordered() {
 
-    // Find and apply topological ordering
-    _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), false);
-
-    // TODO: Compact nodes for other graph types
     VG * vg_splice_graph = dynamic_cast<VG *>(_splice_graph.get());
+
     if (vg_splice_graph != nullptr) {
 
+        // Find and apply topological ordering
+        _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), false);
+
         // Compact node ids and update embedded paths
         hash_map<vg::id_t, vg::id_t> compacted_nodes;
         vg_splice_graph->compact_ids(compacted_nodes);
@@ -1022,6 +1022,21 @@ void Transcriptome::compact_ordered() {
 
             mapping.mutable_position()->set_node_id(compacted_nodes.at(mapping.position().node_id()));
         }
     }
+
+    } else {
+
+        // Add transcript paths to graph in order to compact ids of non-vg graphs.
+        // TODO: Find better solution.
+        embed_transcript_paths(true, true);
+        _splice_graph->apply_ordering(algorithms::topological_order(_splice_graph.get()), true);
+
+        for (auto & transcript_path: _transcript_paths) {
+
+            auto path_handle = _splice_graph->get_path_handle(transcript_path.name);
+            transcript_path.path = path_from_path_handle(*_splice_graph, path_handle);
+
+            _splice_graph->destroy_path(path_handle);
+        }
     }
 }
 
@@ -1106,7 +1121,7 @@ void Transcriptome::write_sequences(ostream * fasta_ostream, const bool output_r
 
     if (!transcript_path.haplotype_origins.empty() || output_reference_transcripts) {
 
         // Write transcript path name and sequence.
-        write_fasta_sequence(transcript_path.name, path_sequence(*_splice_graph, transcript_path.path), cout);
+        write_fasta_sequence(transcript_path.name, path_sequence(*_splice_graph, transcript_path.path), *fasta_ostream);
     }
 }
 
From 6b463db4e1d3683ffaca2254b932112751d66bf4 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 17:04:51 -0800
Subject: [PATCH 78/79] fix destroy_handle handle issue

---
 src/transcriptome.cpp | 52 +++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index 9258419f80c..cb303be1b56 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -862,14 +862,12 @@ bool Transcriptome::add_novel_transcript_junctions(const list<TranscriptPath> &
 
         auto & cur_mapping = transcript_path.path.mapping(i);
 
-        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || !_splice_graph->has_node(cur_mapping.position().node_id()) || _splice_graph->get_length(_splice_graph->get_handle(cur_mapping.position().node_id())) != cur_mapping.edit(0).from_length()) {
+        if (cur_mapping.position().offset() > 0 || cur_mapping.edit_size() > 1 || !edit_is_match(cur_mapping.edit(0)) || _splice_graph->get_length(_splice_graph->get_handle(cur_mapping.position().node_id())) != cur_mapping.edit(0).from_length()) {
 
             all_junctions_added = false;
             i++;
 
-            continue;
-        }
-
-        if (i > 0) {
+
+        } else if (i > 0) {
 
             auto & prev_mapping = transcript_path.path.mapping(i - 1);
 
@@ -970,11 +968,21 @@ const MutablePathDeletableHandleGraph & Transcriptome::splice_graph() const {
 
 void Transcriptome::remove_non_transcribed(const bool new_reference_paths) {
 
+    vector<path_handle_t> path_handles;
+    path_handles.reserve(_splice_graph->get_path_count());
+
     assert(_splice_graph->for_each_path_handle([&](const path_handle_t & path_handle) {
 
+        path_handles.emplace_back(path_handle);
+    }));
+
     // Remove non transcript paths.
+    for (auto & path_handle: path_handles) {
 
         _splice_graph->destroy_path(path_handle);
-    }));
+    }
+
+    assert(_splice_graph->get_path_count() == 0);
 
     // Find all nodes that are in a transcript path.
     unordered_set<vg::id_t> transcribed_nodes;
 
     for (auto & transcript_path: _transcript_paths) {
 
         assert(transcript_path.path.mapping_size() > 0);
-        transcribed_nodes.emplace(transcript_path.path.mapping(0).position().node_id());
-
-        for (size_t i = 1; i < transcript_path.path.mapping_size(); i++) {
+        for (auto & mapping: transcript_path.path.mapping()) {
 
-            transcribed_nodes.emplace(transcript_path.path.mapping(i).position().node_id());
+            transcribed_nodes.emplace(mapping.position().node_id());
         }
     }
 
-    // Delete all nodes that are not in a transcript path.
-    vector<vg::id_t> non_transcribed_nodes;
+    vector<handle_t> non_transcribed_handles;
+    non_transcribed_handles.reserve(_splice_graph->get_node_count() - transcribed_nodes.size());
+
+    // Collect all nodes that are not in a transcript path.
     assert(_splice_graph->for_each_handle([&](const handle_t & handle) {
 
         if (transcribed_nodes.count(_splice_graph->get_id(handle)) == 0) {
 
-            _splice_graph->destroy_handle(handle);
-        }
+            non_transcribed_handles.emplace_back(handle);
+        }
     }));
+
+    for (auto & handle: non_transcribed_handles) {
+
+        // Delete node and in/out edges.
+        _splice_graph->destroy_handle(handle);
+    }
+
+    assert(_splice_graph->get_node_count() == transcribed_nodes.size());
 }
 
 void Transcriptome::compact_ordered() {
@@ -1046,16 +1062,14 @@ void Transcriptome::embed_transcript_paths(const bool add_reference_paths, const
 
     for (auto & transcript_path: _transcript_paths) {
 
         assert(!transcript_path.haplotype_origins.empty() || !transcript_path.reference_origin.empty());
-        auto path_handle = _splice_graph->create_path_handle(transcript_path.name);
 
         if ((add_reference_paths && !transcript_path.reference_origin.empty()) || (add_non_reference_paths && !transcript_path.haplotype_origins.empty())) {
 
+            auto path_handle = _splice_graph->create_path_handle(transcript_path.name);
             for (auto & mapping: transcript_path.path.mapping()) {
 
-                auto handle = _splice_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse());
-                _splice_graph->append_step(path_handle, handle);
+                _splice_graph->append_step(path_handle, _splice_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse()));
             }
         }
     }
 
From 9ee2c4c4c858af8009bac2d87ef24643b4ae92c4 Mon Sep 17 00:00:00 2001
From: jonassibbesen
Date: Tue, 3 Dec 2019 17:05:07 -0800
Subject: [PATCH 79/79] remove debug

---
 src/transcriptome.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index cb303be1b56..0db304b6cfc 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -14,7 +14,7 @@ namespace vg {
 
 using namespace std;
 
-#define transcriptome_debug
+// #define transcriptome_debug
 
 Transcriptome::Transcriptome(const string & graph_filename, const bool show_progress) {
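
A closing note on the destroy_handle fix in patch 78: both
remove_non_transcribed() changes apply the same collect-then-destroy pattern,
because deleting paths or nodes while for_each_path_handle() /
for_each_handle() is still running can invalidate the iteration. The following
is a minimal sketch of that pattern, not vg code; `prune_to_nodes` and `keep`
are illustrative names:

    #include <unordered_set>
    #include <vector>

    #include <handlegraph/deletable_handle_graph.hpp>

    using namespace std;
    using namespace handlegraph;

    // Remove every node whose id is not in `keep`.
    void prune_to_nodes(DeletableHandleGraph & graph, const unordered_set<nid_t> & keep) {

        vector<handle_t> to_destroy;

        // First pass: only record handles; never mutate the graph mid-iteration.
        graph.for_each_handle([&](const handle_t & handle) {

            if (keep.count(graph.get_id(handle)) == 0) {

                to_destroy.emplace_back(handle);
            }
        });

        // Second pass: deletion is now safe. destroy_handle() also removes
        // the node's incident edges.
        for (auto & handle: to_destroy) {

            graph.destroy_handle(handle);
        }
    }

The reserve() call and the trailing get_node_count() assertion in the patch
serve the same goals on the vg side: avoid reallocation during collection and
verify that exactly the non-transcribed nodes were removed.
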