From ba527f107db790232c4a565e0d07bc56040c1666 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Fri, 26 Jul 2024 11:14:55 -0400
Subject: [PATCH 1/4] skip spanning subtree step for star trees

---
 src/cactus/progressive/multiCactusTree.py       | 17 +++++++++++------
 .../progressive/progressive_decomposition.py    |  4 ++--
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/cactus/progressive/multiCactusTree.py b/src/cactus/progressive/multiCactusTree.py
index ce9541fb2..37f81a4c2 100644
--- a/src/cactus/progressive/multiCactusTree.py
+++ b/src/cactus/progressive/multiCactusTree.py
@@ -87,14 +87,19 @@ def traverseSubtree(self, root, node):
                     yield i
 
     # Extracts a tree spanning the nodes with the given names.
-    def extractSpanningTree(self, nodes):
+    def extractSpanningTree(self, nodes, root_id=None):
         assert len(nodes) > 1
         nodeIds = [self.nameToId[name] for name in nodes]
-        paths = [dijkstra_path(self.nxDg.to_undirected(), source=nodeIds[0], target=x) for x in nodeIds[1:]]
-        nodesToInclude = set()
-        for path in paths:
-            for node in path:
-                nodesToInclude.add(node)
+        if root_id is not None and all([self.getParent(n) == root_id for n in nodeIds]):
+            # this is a simple star tree, don't need path-finding logic below which doesn't scale to giant trees
+            nodesToInclude = nodeIds + [root_id]
+        else:
+            sourceId = root_id if root_id is not None else nodeIds[0]
+            paths = [dijkstra_path(self.nxDg.to_undirected(), source=sourceId, target=x) for x in nodeIds[1:]]
+            nodesToInclude = set()
+            for path in paths:
+                for node in path:
+                    nodesToInclude.add(node)
 
         cpy = self.nxDg.subgraph(nodesToInclude).copy()
         # Get rid of nodes that have only 1 children
diff --git a/src/cactus/progressive/progressive_decomposition.py b/src/cactus/progressive/progressive_decomposition.py
index 374d188f2..387298022 100644
--- a/src/cactus/progressive/progressive_decomposition.py
+++ b/src/cactus/progressive/progressive_decomposition.py
@@ -31,7 +31,7 @@
 from cactus.progressive.seqFile import SeqFile
 from sonLib.nxnewick import NXNewick
 from toil.statsAndLogging import logger
-
+from toil.realtimeLogger import RealtimeLogger
 
 def parse_seqfile(seqfile_path, config_wrapper, root_name = None, pangenome = False):
     """
@@ -201,7 +201,7 @@ def get_spanning_subtree(mc_tree, root_name, config_wrapper, outgroup_map):
         node_id_set.add(node_id)
 
     # get the spanning tree
-    spanning_tree = mc_tree.extractSpanningTree([mc_tree.getName(node) for node in node_id_set])
+    spanning_tree = mc_tree.extractSpanningTree([mc_tree.getName(node) for node in node_id_set], root_id = root_id)
 
     spanning_tree.computeSubtreeRoots()
     return spanning_tree

From e7b5c433b70429cd1c4b913ddc4209d5d74a6680 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Fri, 26 Jul 2024 14:06:52 -0400
Subject: [PATCH 2/4] fix convert command for graphaligner

---
 doc/sa_refgraph_hackathon_2023.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/sa_refgraph_hackathon_2023.md b/doc/sa_refgraph_hackathon_2023.md
index f9b5f6102..642334f23 100644
--- a/doc/sa_refgraph_hackathon_2023.md
+++ b/doc/sa_refgraph_hackathon_2023.md
@@ -472,11 +472,11 @@ Note: this section is adpated from [methods for the mc paper](https://github.com
 
 `vg giraffe` will soon be able to map long reads, but is not ready yet. For now, you should use [GraphAligner](https://github.com/maickrau/GraphAligner).
 
-In order for the resulting mapping to be compatible with the `.gbz`, we first convert the `.gbz` into a `.gfa`.  You can run `GraphAligner` on the `.gfa` that comes out of `cactus-pangenome`, but the [coordinates will be different](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md#node-chopping) from the `.gbz`.
+In order for the resulting mapping to be compatible with the `.gbz`, we first convert the `.gbz` into a `.gfa`.  You can run `GraphAligner` on the `.gfa` that comes out of `cactus-pangenome`, but the [coordinates will be different](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md#node-chopping) from the `.gbz` (note the `--vg-algorithm` flag is important for ensuring this).
 
 ```
 singularity exec -H $(pwd) docker://quay.io/comparative-genomics-toolkit/cactus:v2.6.13 \
-bash -c "vg convert ./hprc10/hprc10.gbz -f > ./hprc10/hprc10.gbz.gfa"
+bash -c "vg convert ./hprc10/hprc10.gbz -f --vg-algorithm > ./hprc10/hprc10.gbz.gfa"
 ```
 
 This takes 43 seconds and 10Gb RAM.

From f90e86b6ff9590579e9a1a4cecd2b77a5d8f2f61 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Fri, 26 Jul 2024 15:52:01 -0400
Subject: [PATCH 3/4] turn off abpoa progressive for >5k rows

---
 bar/impl/bar.c                           |  3 ++-
 bar/impl/poaBarAligner.c                 | 15 +++++++++------
 bar/inc/poaBarAligner.h                  | 11 +++++++----
 bar/tests/poaBarTest.c                   |  8 ++++----
 src/cactus/cactus_progressive_config.xml |  4 +++-
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/bar/impl/bar.c b/bar/impl/bar.c
index 36ff8ef68..1bd9ed4b4 100644
--- a/bar/impl/bar.c
+++ b/bar/impl/bar.c
@@ -70,6 +70,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList *
     // Note that poa uses about N^2 memory, so maximum value is generally in 10s of kb
     int64_t poaWindow = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentWindow");
     int64_t maskFilter = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentMaskFilter");
+    int64_t poaMaxProgRows = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentProgressiveMaxRows");
     abpoa_para_t *poaParameters = usePoa ? abpoaParamaters_constructFromCactusParams(params) : NULL;
 
     //////////////////////////////////////////////
@@ -101,7 +102,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList *
              *
              * It does not use any precomputed alignments, if they are provided they will be ignored
              */
-            alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaParameters);
+            alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaMaxProgRows, poaParameters);
             st_logDebug("Created the poa alignments: %" PRIi64 " poa alignment blocks for flower\n", stList_length(alignments));
         } else {
             alignments = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength,
diff --git a/bar/impl/poaBarAligner.c b/bar/impl/poaBarAligner.c
index d5b85f860..7fc97e33d 100644
--- a/bar/impl/poaBarAligner.c
+++ b/bar/impl/poaBarAligner.c
@@ -461,7 +461,7 @@ static void msa_fix_trimmed(Msa* msa) {
 }
 
 Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no, int64_t window_size,
-                                      abpoa_para_t *poa_parameters) {
+                                      int64_t max_prog_rows, abpoa_para_t *poa_parameters) {
 
     assert(seq_no > 0);
 
@@ -564,6 +564,9 @@ Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no
         // init abpoa
         abpoa_t *ab = abpoa_init();
         abpoa_para_t *abpt = copy_abpoa_params(poa_parameters);
+        if (msa->seq_no > max_prog_rows) {
+            abpt->progressive_poa = 0;
+        }
         
 #ifdef CACTUS_ABPOA_MSA_DUMP_DIR
         // dump the input to file
@@ -745,7 +748,7 @@ Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no
 
 Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_lengths, char ***end_strings,
         int **end_string_lengths, int64_t **right_end_indexes, int64_t **right_end_row_indexes, int64_t **overlaps,
-        int64_t window_size, abpoa_para_t *poa_parameters) {
+        int64_t window_size, int64_t max_prog_rows, abpoa_para_t *poa_parameters) {
     // Calculate the initial, potentially inconsistent msas and column scores for each msa
     float *column_scores[end_no];
     Msa **msas = st_malloc(sizeof(Msa *) * end_no);
@@ -754,7 +757,7 @@ Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_leng
 //#endif
     for(int64_t i=0; i<end_no; i++) {
         msas[i] = msa_make_partial_order_alignment(end_strings[i], end_string_lengths[i], end_lengths[i], window_size,
-                                                   poa_parameters);
+                                                   max_prog_rows, poa_parameters);
         column_scores[i] = make_column_scores(msas[i]);
     }
 
@@ -1094,7 +1097,7 @@ int64_t getMaxSequenceLength(End *end) {
 }
 
 stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter,
-                                  abpoa_para_t * poa_parameters) {
+                                  int64_t max_prog_rows, abpoa_para_t * poa_parameters) {
     End *dominantEnd = getDominantEnd(flower);
     int64_t seq_no = dominantEnd != NULL ? end_getInstanceNumber(dominantEnd) : -1;
     if(dominantEnd != NULL && getMaxSequenceLength(dominantEnd) < max_seq_length) {
@@ -1110,7 +1113,7 @@ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_
         Cap *indices_to_caps[seq_no];
 
         get_end_sequences(dominantEnd, end_strings, end_string_lengths, overlaps, indices_to_caps, max_seq_length, mask_filter);
-        Msa *msa = msa_make_partial_order_alignment(end_strings, end_string_lengths, seq_no, window_size, poa_parameters);
+        Msa *msa = msa_make_partial_order_alignment(end_strings, end_string_lengths, seq_no, window_size, max_prog_rows, poa_parameters);
 
         //Now convert to set of alignment blocks
         stList *alignment_blocks = stList_construct3(0, (void (*)(void *))alignmentBlock_destruct);
@@ -1184,7 +1187,7 @@ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_
     // Now make the consistent MSAs
     Msa **msas = make_consistent_partial_order_alignments(end_no, end_lengths, end_strings, end_string_lengths,
                                                           right_end_indexes, right_end_row_indexes, overlaps, window_size,
-                                                          poa_parameters);
+                                                          max_prog_rows, poa_parameters);
 
     // Temp debug output
     //for(int64_t i=0; i<end_no; i++) {
diff --git a/bar/inc/poaBarAligner.h b/bar/inc/poaBarAligner.h
index 2d3a65af0..2b2ed85fa 100644
--- a/bar/inc/poaBarAligner.h
+++ b/bar/inc/poaBarAligner.h
@@ -68,6 +68,7 @@ void msa_print(Msa *msa, FILE *f);
  * @param seq_lens An array giving the string lengths
  * @param seq_no The number of strings
  * @param window_size Sliding window size which limits length of poa sub-alignments.  Memory usage is quardatic in this. 
+ * @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup)
  * @param poa_parameters abpoa parameters
  * @return An msa of the strings.
  */
@@ -75,6 +76,7 @@ Msa *msa_make_partial_order_alignment(char **seqs,
                                       int *seq_lens,
                                       int64_t seq_no,
                                       int64_t window_size,
+                                      int64_t max_prog_rows,
                                       abpoa_para_t *poa_parameters);
 
 /**
@@ -96,12 +98,13 @@ Msa *msa_make_partial_order_alignment(char **seqs,
  * @param right_end_row_indexes For each string, the index of the row of its reverse complement
  * @param overlaps For each prefix string, the length of the overlap with its reverse complement adjacency
  * @param window_size Sliding window size which limits length of poa sub-alignments.  Memory usage is quardatic in this. 
+ * @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup)
  * @param poa_parameters abpoa parameters
  * @return A consistent Msa for each end
  */
 Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_lengths, char ***end_strings,
         int **end_string_lengths, int64_t **right_end_indexes, int64_t **right_end_row_indexes, int64_t **overlaps,
-        int64_t window_size, abpoa_para_t *poa_parameters);
+        int64_t window_size, int64_t max_prog_rows, abpoa_para_t *poa_parameters);
 
 /**
  * Represents a gapless alignment of a set of sequences.
@@ -136,14 +139,14 @@ char *get_adjacency_string(Cap *cap, int *length, bool return_string);
  * to attempt to align.
  * @param window_size Sliding window size which limits length of poa sub-alignments.  Memory usage is quardatic in this. 
  * @param mask_filter Trim input sequences if encountering this many consecutive soft of hard masked bases (0 = disabled)
- * @param poa_band_constant abpoa "b" parameter, where adaptive band is b+f*<length> (b < 0 = disabled)
- * @param poa_band_fraction abpoa "f" parameter, where adaptive band is b+f*<length> (b < 0 = disabled)
- * Returns a list of AlignmentBlock ojects
+ * @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup)
+ * @param poa_parameters abpoa parameters
  */
 stList *make_flower_alignment_poa(Flower *flower,
                                   int64_t max_seq_length,
                                   int64_t window_size,
                                   int64_t mask_filter,
+                                  int64_t max_prog_rows,
                                   abpoa_para_t * poa_parameters);
 
 /**
diff --git a/bar/tests/poaBarTest.c b/bar/tests/poaBarTest.c
index d9950dd0c..adb10ae5e 100644
--- a/bar/tests/poaBarTest.c
+++ b/bar/tests/poaBarTest.c
@@ -65,7 +65,7 @@ void test_make_partial_order_alignment(CuTest *testCase) {
             }
 
             // generate the alignment
-            Msa *msa = msa_make_partial_order_alignment(seqs, seq_lens, seq_no, poa_window_size, abpt);
+            Msa *msa = msa_make_partial_order_alignment(seqs, seq_lens, seq_no, poa_window_size, 1000, abpt);
 
             // print the msa
 #ifdef stderr_logging
@@ -145,7 +145,7 @@ void test_make_consistent_partial_order_alignments_two_ends(CuTest *testCase) {
 
         // generate the alignments
         Msa **msas = make_consistent_partial_order_alignments(end_no, end_lengths, end_strings, end_string_lengths,
-                                                              right_end_indexes, right_end_row_indexes, overlaps, 1000000, abpt);
+                                                              right_end_indexes, right_end_row_indexes, overlaps, 1000000, 100, abpt);
 
         // print the msas
 #ifdef stderr_logging
@@ -212,7 +212,7 @@ void test_make_flower_alignment_poa(CuTest *testCase) {
     }
     flower_destructEndIterator(endIterator);
 
-    stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, abpt);
+    stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, 1000, abpt);
 
     for(int64_t i=0; i<stList_length(alignment_blocks); i++) {
         AlignmentBlock *b = stList_get(alignment_blocks, i);
@@ -233,7 +233,7 @@ void test_alignment_block_iterator(CuTest *testCase) {
     abpt->wf = 0.01;
     abpoa_post_set_para(abpt);
 
-    stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, abpt);
+    stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, 50, abpt);
 
     abpoa_free_para(abpt);
 #ifdef stderr_logging
diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml
index 64ff659d4..e5f363980 100644
--- a/src/cactus/cactus_progressive_config.xml
+++ b/src/cactus/cactus_progressive_config.xml
@@ -248,7 +248,8 @@
 		<!-- partialOrderAlignmentMinimizerK abpoa kmer size for minimizer seeding. -->
 		<!-- partialOrderAlignmentMinimizerW abpoa window size for minimizer seeding. -->
 		<!-- partialOrderAlignmentMinimizerMinW abpoa minimum window size. -->
-		<!-- partialOrderAlignmentProgressiveMode= use guide tree from jaccard distance matrix to determine poa order -->
+		<!-- partialOrderAlignmentProgressiveMode use guide tree from jaccard distance matrix to determine poa order -->
+		<!-- partialOrderAlignmentProgressiveMaxRows disable progressive mode if there are more than this many rows to align -->
 		<poa
 			partialOrderAlignmentWindow="10000"
 			partialOrderAlignmentMaskFilter="-1"
@@ -264,6 +265,7 @@
 			partialOrderAlignmentMinimizerW="10"
 			partialOrderAlignmentMinimizerMinW="500"
 			partialOrderAlignmentProgressiveMode="1"
+			partialOrderAlignmentProgressiveMaxRows="5000"
 		/>
 	</bar>
 

From 5c707814d43e01892dc914ebade37ced45e3ffd2 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Mon, 29 Jul 2024 09:28:58 -0400
Subject: [PATCH 4/4] leave old spanning logic alone

---
 src/cactus/progressive/multiCactusTree.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/cactus/progressive/multiCactusTree.py b/src/cactus/progressive/multiCactusTree.py
index 37f81a4c2..a4cb06717 100644
--- a/src/cactus/progressive/multiCactusTree.py
+++ b/src/cactus/progressive/multiCactusTree.py
@@ -94,8 +94,7 @@ def extractSpanningTree(self, nodes, root_id=None):
             # this is a simple star tree, don't need path-finding logic below which doesn't scale to giant trees
             nodesToInclude = nodeIds + [root_id]
         else:
-            sourceId = root_id if root_id is not None else nodeIds[0]
-            paths = [dijkstra_path(self.nxDg.to_undirected(), source=sourceId, target=x) for x in nodeIds[1:]]
+            paths = [dijkstra_path(self.nxDg.to_undirected(), source=nodeIds[0], target=x) for x in nodeIds[1:]]
             nodesToInclude = set()
             for path in paths:
                 for node in path: