From ba527f107db790232c4a565e0d07bc56040c1666 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Jul 2024 11:14:55 -0400 Subject: [PATCH 1/4] skip spanning subtree step for star trees --- src/cactus/progressive/multiCactusTree.py | 17 +++++++++++------ .../progressive/progressive_decomposition.py | 4 ++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/cactus/progressive/multiCactusTree.py b/src/cactus/progressive/multiCactusTree.py index ce9541fb2..37f81a4c2 100644 --- a/src/cactus/progressive/multiCactusTree.py +++ b/src/cactus/progressive/multiCactusTree.py @@ -87,14 +87,19 @@ def traverseSubtree(self, root, node): yield i # Extracts a tree spanning the nodes with the given names. - def extractSpanningTree(self, nodes): + def extractSpanningTree(self, nodes, root_id=None): assert len(nodes) > 1 nodeIds = [self.nameToId[name] for name in nodes] - paths = [dijkstra_path(self.nxDg.to_undirected(), source=nodeIds[0], target=x) for x in nodeIds[1:]] - nodesToInclude = set() - for path in paths: - for node in path: - nodesToInclude.add(node) + if root_id is not None and all([self.getParent(n) == root_id for n in nodeIds]): + # this is a simple star tree, don't need path-finding logic below which doesn't scale to giant trees + nodesToInclude = nodeIds + [root_id] + else: + sourceId = root_id if root_id is not None else nodeIds[0] + paths = [dijkstra_path(self.nxDg.to_undirected(), source=sourceId, target=x) for x in nodeIds[1:]] + nodesToInclude = set() + for path in paths: + for node in path: + nodesToInclude.add(node) cpy = self.nxDg.subgraph(nodesToInclude).copy() # Get rid of nodes that have only 1 children diff --git a/src/cactus/progressive/progressive_decomposition.py b/src/cactus/progressive/progressive_decomposition.py index 374d188f2..387298022 100644 --- a/src/cactus/progressive/progressive_decomposition.py +++ b/src/cactus/progressive/progressive_decomposition.py @@ -31,7 +31,7 @@ from cactus.progressive.seqFile import SeqFile from sonLib.nxnewick import NXNewick from toil.statsAndLogging import logger - +from toil.realtimeLogger import RealtimeLogger def parse_seqfile(seqfile_path, config_wrapper, root_name = None, pangenome = False): """ @@ -201,7 +201,7 @@ def get_spanning_subtree(mc_tree, root_name, config_wrapper, outgroup_map): node_id_set.add(node_id) # get the spanning tree - spanning_tree = mc_tree.extractSpanningTree([mc_tree.getName(node) for node in node_id_set]) + spanning_tree = mc_tree.extractSpanningTree([mc_tree.getName(node) for node in node_id_set], root_id = root_id) spanning_tree.computeSubtreeRoots() return spanning_tree From e7b5c433b70429cd1c4b913ddc4209d5d74a6680 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Jul 2024 14:06:52 -0400 Subject: [PATCH 2/4] fix convert command for graphaligner --- doc/sa_refgraph_hackathon_2023.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sa_refgraph_hackathon_2023.md b/doc/sa_refgraph_hackathon_2023.md index f9b5f6102..642334f23 100644 --- a/doc/sa_refgraph_hackathon_2023.md +++ b/doc/sa_refgraph_hackathon_2023.md @@ -472,11 +472,11 @@ Note: this section is adpated from [methods for the mc paper](https://github.com `vg giraffe` will soon be able to map long reads, but is not ready yet. For now, you should use [GraphAligner](https://github.com/maickrau/GraphAligner). -In order for the resulting mapping to be compatible with the `.gbz`, we first convert the `.gbz` into a `.gfa`. You can run `GraphAligner` on the `.gfa` that comes out of `cactus-pangenome`, but the [coordinates will be different](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md#node-chopping) from the `.gbz`. +In order for the resulting mapping to be compatible with the `.gbz`, we first convert the `.gbz` into a `.gfa`. You can run `GraphAligner` on the `.gfa` that comes out of `cactus-pangenome`, but the [coordinates will be different](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md#node-chopping) from the `.gbz` (note the `--vg-algorithm` flag is important for ensuring this). ``` singularity exec -H $(pwd) docker://quay.io/comparative-genomics-toolkit/cactus:v2.6.13 \ -bash -c "vg convert ./hprc10/hprc10.gbz -f > ./hprc10/hprc10.gbz.gfa" +bash -c "vg convert ./hprc10/hprc10.gbz -f --vg-algorithm > ./hprc10/hprc10.gbz.gfa" ``` This takes 43 seconds and 10Gb RAM. From f90e86b6ff9590579e9a1a4cecd2b77a5d8f2f61 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Jul 2024 15:52:01 -0400 Subject: [PATCH 3/4] turn off abpoa progressive for >5k rows --- bar/impl/bar.c | 3 ++- bar/impl/poaBarAligner.c | 15 +++++++++------ bar/inc/poaBarAligner.h | 11 +++++++---- bar/tests/poaBarTest.c | 8 ++++---- src/cactus/cactus_progressive_config.xml | 4 +++- 5 files changed, 25 insertions(+), 16 deletions(-) diff --git a/bar/impl/bar.c b/bar/impl/bar.c index 36ff8ef68..1bd9ed4b4 100644 --- a/bar/impl/bar.c +++ b/bar/impl/bar.c @@ -70,6 +70,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList * // Note that poa uses about N^2 memory, so maximum value is generally in 10s of kb int64_t poaWindow = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentWindow"); int64_t maskFilter = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentMaskFilter"); + int64_t poaMaxProgRows = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentProgressiveMaxRows"); abpoa_para_t *poaParameters = usePoa ? abpoaParamaters_constructFromCactusParams(params) : NULL; ////////////////////////////////////////////// @@ -101,7 +102,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList * * * It does not use any precomputed alignments, if they are provided they will be ignored */ - alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaParameters); + alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaMaxProgRows, poaParameters); st_logDebug("Created the poa alignments: %" PRIi64 " poa alignment blocks for flower\n", stList_length(alignments)); } else { alignments = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength, diff --git a/bar/impl/poaBarAligner.c b/bar/impl/poaBarAligner.c index d5b85f860..7fc97e33d 100644 --- a/bar/impl/poaBarAligner.c +++ b/bar/impl/poaBarAligner.c @@ -461,7 +461,7 @@ static void msa_fix_trimmed(Msa* msa) { } Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no, int64_t window_size, - abpoa_para_t *poa_parameters) { + int64_t max_prog_rows, abpoa_para_t *poa_parameters) { assert(seq_no > 0); @@ -564,6 +564,9 @@ Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no // init abpoa abpoa_t *ab = abpoa_init(); abpoa_para_t *abpt = copy_abpoa_params(poa_parameters); + if (msa->seq_no > max_prog_rows) { + abpt->progressive_poa = 0; + } #ifdef CACTUS_ABPOA_MSA_DUMP_DIR // dump the input to file @@ -745,7 +748,7 @@ Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_lengths, char ***end_strings, int **end_string_lengths, int64_t **right_end_indexes, int64_t **right_end_row_indexes, int64_t **overlaps, - int64_t window_size, abpoa_para_t *poa_parameters) { + int64_t window_size, int64_t max_prog_rows, abpoa_para_t *poa_parameters) { // Calculate the initial, potentially inconsistent msas and column scores for each msa float *column_scores[end_no]; Msa **msas = st_malloc(sizeof(Msa *) * end_no); @@ -754,7 +757,7 @@ Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_leng //#endif for(int64_t i=0; i (b < 0 = disabled) - * @param poa_band_fraction abpoa "f" parameter, where adaptive band is b+f* (b < 0 = disabled) - * Returns a list of AlignmentBlock ojects + * @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup) + * @param poa_parameters abpoa parameters */ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter, + int64_t max_prog_rows, abpoa_para_t * poa_parameters); /** diff --git a/bar/tests/poaBarTest.c b/bar/tests/poaBarTest.c index d9950dd0c..adb10ae5e 100644 --- a/bar/tests/poaBarTest.c +++ b/bar/tests/poaBarTest.c @@ -65,7 +65,7 @@ void test_make_partial_order_alignment(CuTest *testCase) { } // generate the alignment - Msa *msa = msa_make_partial_order_alignment(seqs, seq_lens, seq_no, poa_window_size, abpt); + Msa *msa = msa_make_partial_order_alignment(seqs, seq_lens, seq_no, poa_window_size, 1000, abpt); // print the msa #ifdef stderr_logging @@ -145,7 +145,7 @@ void test_make_consistent_partial_order_alignments_two_ends(CuTest *testCase) { // generate the alignments Msa **msas = make_consistent_partial_order_alignments(end_no, end_lengths, end_strings, end_string_lengths, - right_end_indexes, right_end_row_indexes, overlaps, 1000000, abpt); + right_end_indexes, right_end_row_indexes, overlaps, 1000000, 100, abpt); // print the msas #ifdef stderr_logging @@ -212,7 +212,7 @@ void test_make_flower_alignment_poa(CuTest *testCase) { } flower_destructEndIterator(endIterator); - stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, abpt); + stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, 1000, abpt); for(int64_t i=0; iwf = 0.01; abpoa_post_set_para(abpt); - stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, abpt); + stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, 50, abpt); abpoa_free_para(abpt); #ifdef stderr_logging diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index 64ff659d4..e5f363980 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -248,7 +248,8 @@ - + + From 5c707814d43e01892dc914ebade37ced45e3ffd2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Jul 2024 09:28:58 -0400 Subject: [PATCH 4/4] leave old spanning logic alone --- src/cactus/progressive/multiCactusTree.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cactus/progressive/multiCactusTree.py b/src/cactus/progressive/multiCactusTree.py index 37f81a4c2..a4cb06717 100644 --- a/src/cactus/progressive/multiCactusTree.py +++ b/src/cactus/progressive/multiCactusTree.py @@ -94,8 +94,7 @@ def extractSpanningTree(self, nodes, root_id=None): # this is a simple star tree, don't need path-finding logic below which doesn't scale to giant trees nodesToInclude = nodeIds + [root_id] else: - sourceId = root_id if root_id is not None else nodeIds[0] - paths = [dijkstra_path(self.nxDg.to_undirected(), source=sourceId, target=x) for x in nodeIds[1:]] + paths = [dijkstra_path(self.nxDg.to_undirected(), source=nodeIds[0], target=x) for x in nodeIds[1:]] nodesToInclude = set() for path in paths: for node in path: