Skip to content

Commit

Permalink
Merge pull request #1451 from ComparativeGenomicsToolkit/oneshot
Browse files Browse the repository at this point in the history
Scaling improvements in Cactus alignment stage for high-depth pangenomes.
  • Loading branch information
glennhickey authored Jul 29, 2024
2 parents 634fb26 + 5c70781 commit 6a64782
Show file tree
Hide file tree
Showing 8 changed files with 39 additions and 26 deletions.
3 changes: 2 additions & 1 deletion bar/impl/bar.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList *
// Note that poa uses about N^2 memory, so maximum value is generally in 10s of kb
int64_t poaWindow = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentWindow");
int64_t maskFilter = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentMaskFilter");
int64_t poaMaxProgRows = cactusParams_get_int(params, 3, "bar", "poa", "partialOrderAlignmentProgressiveMaxRows");
abpoa_para_t *poaParameters = usePoa ? abpoaParamaters_constructFromCactusParams(params) : NULL;

//////////////////////////////////////////////
Expand Down Expand Up @@ -101,7 +102,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList *
*
* It does not use any precomputed alignments, if they are provided they will be ignored
*/
alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaParameters);
alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaMaxProgRows, poaParameters);
st_logDebug("Created the poa alignments: %" PRIi64 " poa alignment blocks for flower\n", stList_length(alignments));
} else {
alignments = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength,
Expand Down
15 changes: 9 additions & 6 deletions bar/impl/poaBarAligner.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ static void msa_fix_trimmed(Msa* msa) {
}

Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no, int64_t window_size,
abpoa_para_t *poa_parameters) {
int64_t max_prog_rows, abpoa_para_t *poa_parameters) {

assert(seq_no > 0);

Expand Down Expand Up @@ -564,6 +564,9 @@ Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no
// init abpoa
abpoa_t *ab = abpoa_init();
abpoa_para_t *abpt = copy_abpoa_params(poa_parameters);
if (msa->seq_no > max_prog_rows) {
abpt->progressive_poa = 0;
}

#ifdef CACTUS_ABPOA_MSA_DUMP_DIR
// dump the input to file
Expand Down Expand Up @@ -745,7 +748,7 @@ Msa *msa_make_partial_order_alignment(char **seqs, int *seq_lens, int64_t seq_no

Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_lengths, char ***end_strings,
int **end_string_lengths, int64_t **right_end_indexes, int64_t **right_end_row_indexes, int64_t **overlaps,
int64_t window_size, abpoa_para_t *poa_parameters) {
int64_t window_size, int64_t max_prog_rows, abpoa_para_t *poa_parameters) {
// Calculate the initial, potentially inconsistent msas and column scores for each msa
float *column_scores[end_no];
Msa **msas = st_malloc(sizeof(Msa *) * end_no);
Expand All @@ -754,7 +757,7 @@ Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_leng
//#endif
for(int64_t i=0; i<end_no; i++) {
msas[i] = msa_make_partial_order_alignment(end_strings[i], end_string_lengths[i], end_lengths[i], window_size,
poa_parameters);
max_prog_rows, poa_parameters);
column_scores[i] = make_column_scores(msas[i]);
}

Expand Down Expand Up @@ -1094,7 +1097,7 @@ int64_t getMaxSequenceLength(End *end) {
}

stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter,
abpoa_para_t * poa_parameters) {
int64_t max_prog_rows, abpoa_para_t * poa_parameters) {
End *dominantEnd = getDominantEnd(flower);
int64_t seq_no = dominantEnd != NULL ? end_getInstanceNumber(dominantEnd) : -1;
if(dominantEnd != NULL && getMaxSequenceLength(dominantEnd) < max_seq_length) {
Expand All @@ -1110,7 +1113,7 @@ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_
Cap *indices_to_caps[seq_no];

get_end_sequences(dominantEnd, end_strings, end_string_lengths, overlaps, indices_to_caps, max_seq_length, mask_filter);
Msa *msa = msa_make_partial_order_alignment(end_strings, end_string_lengths, seq_no, window_size, poa_parameters);
Msa *msa = msa_make_partial_order_alignment(end_strings, end_string_lengths, seq_no, window_size, max_prog_rows, poa_parameters);

//Now convert to set of alignment blocks
stList *alignment_blocks = stList_construct3(0, (void (*)(void *))alignmentBlock_destruct);
Expand Down Expand Up @@ -1184,7 +1187,7 @@ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_
// Now make the consistent MSAs
Msa **msas = make_consistent_partial_order_alignments(end_no, end_lengths, end_strings, end_string_lengths,
right_end_indexes, right_end_row_indexes, overlaps, window_size,
poa_parameters);
max_prog_rows, poa_parameters);

// Temp debug output
//for(int64_t i=0; i<end_no; i++) {
Expand Down
11 changes: 7 additions & 4 deletions bar/inc/poaBarAligner.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,15 @@ void msa_print(Msa *msa, FILE *f);
* @param seq_lens An array giving the string lengths
* @param seq_no The number of strings
* @param window_size Sliding window size which limits length of poa sub-alignments. Memory usage is quardatic in this.
* @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup)
* @param poa_parameters abpoa parameters
* @return An msa of the strings.
*/
Msa *msa_make_partial_order_alignment(char **seqs,
int *seq_lens,
int64_t seq_no,
int64_t window_size,
int64_t max_prog_rows,
abpoa_para_t *poa_parameters);

/**
Expand All @@ -96,12 +98,13 @@ Msa *msa_make_partial_order_alignment(char **seqs,
* @param right_end_row_indexes For each string, the index of the row of its reverse complement
* @param overlaps For each prefix string, the length of the overlap with its reverse complement adjacency
* @param window_size Sliding window size which limits length of poa sub-alignments. Memory usage is quardatic in this.
* @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup)
* @param poa_parameters abpoa parameters
* @return A consistent Msa for each end
*/
Msa **make_consistent_partial_order_alignments(int64_t end_no, int64_t *end_lengths, char ***end_strings,
int **end_string_lengths, int64_t **right_end_indexes, int64_t **right_end_row_indexes, int64_t **overlaps,
int64_t window_size, abpoa_para_t *poa_parameters);
int64_t window_size, int64_t max_prog_rows, abpoa_para_t *poa_parameters);

/**
* Represents a gapless alignment of a set of sequences.
Expand Down Expand Up @@ -136,14 +139,14 @@ char *get_adjacency_string(Cap *cap, int *length, bool return_string);
* to attempt to align.
* @param window_size Sliding window size which limits length of poa sub-alignments. Memory usage is quardatic in this.
* @param mask_filter Trim input sequences if encountering this many consecutive soft of hard masked bases (0 = disabled)
* @param poa_band_constant abpoa "b" parameter, where adaptive band is b+f*<length> (b < 0 = disabled)
* @param poa_band_fraction abpoa "f" parameter, where adaptive band is b+f*<length> (b < 0 = disabled)
* Returns a list of AlignmentBlock ojects
* @param max_prog_rows Disable abpoas progressive alignment if there are more than this many rows (avoid quadratic dist mat blowup)
* @param poa_parameters abpoa parameters
*/
stList *make_flower_alignment_poa(Flower *flower,
int64_t max_seq_length,
int64_t window_size,
int64_t mask_filter,
int64_t max_prog_rows,
abpoa_para_t * poa_parameters);

/**
Expand Down
8 changes: 4 additions & 4 deletions bar/tests/poaBarTest.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ void test_make_partial_order_alignment(CuTest *testCase) {
}

// generate the alignment
Msa *msa = msa_make_partial_order_alignment(seqs, seq_lens, seq_no, poa_window_size, abpt);
Msa *msa = msa_make_partial_order_alignment(seqs, seq_lens, seq_no, poa_window_size, 1000, abpt);

// print the msa
#ifdef stderr_logging
Expand Down Expand Up @@ -145,7 +145,7 @@ void test_make_consistent_partial_order_alignments_two_ends(CuTest *testCase) {

// generate the alignments
Msa **msas = make_consistent_partial_order_alignments(end_no, end_lengths, end_strings, end_string_lengths,
right_end_indexes, right_end_row_indexes, overlaps, 1000000, abpt);
right_end_indexes, right_end_row_indexes, overlaps, 1000000, 100, abpt);

// print the msas
#ifdef stderr_logging
Expand Down Expand Up @@ -212,7 +212,7 @@ void test_make_flower_alignment_poa(CuTest *testCase) {
}
flower_destructEndIterator(endIterator);

stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, abpt);
stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, 1000, abpt);

for(int64_t i=0; i<stList_length(alignment_blocks); i++) {
AlignmentBlock *b = stList_get(alignment_blocks, i);
Expand All @@ -233,7 +233,7 @@ void test_alignment_block_iterator(CuTest *testCase) {
abpt->wf = 0.01;
abpoa_post_set_para(abpt);

stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, abpt);
stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, 50, abpt);

abpoa_free_para(abpt);
#ifdef stderr_logging
Expand Down
4 changes: 2 additions & 2 deletions doc/sa_refgraph_hackathon_2023.md
Original file line number Diff line number Diff line change
Expand Up @@ -472,11 +472,11 @@ Note: this section is adpated from [methods for the mc paper](https://github.com

`vg giraffe` will soon be able to map long reads, but is not ready yet. For now, you should use [GraphAligner](https://github.com/maickrau/GraphAligner).

In order for the resulting mapping to be compatible with the `.gbz`, we first convert the `.gbz` into a `.gfa`. You can run `GraphAligner` on the `.gfa` that comes out of `cactus-pangenome`, but the [coordinates will be different](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md#node-chopping) from the `.gbz`.
In order for the resulting mapping to be compatible with the `.gbz`, we first convert the `.gbz` into a `.gfa`. You can run `GraphAligner` on the `.gfa` that comes out of `cactus-pangenome`, but the [coordinates will be different](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md#node-chopping) from the `.gbz` (note the `--vg-algorithm` flag is important for ensuring this).

```
singularity exec -H $(pwd) docker://quay.io/comparative-genomics-toolkit/cactus:v2.6.13 \
bash -c "vg convert ./hprc10/hprc10.gbz -f > ./hprc10/hprc10.gbz.gfa"
bash -c "vg convert ./hprc10/hprc10.gbz -f --vg-algorithm > ./hprc10/hprc10.gbz.gfa"
```

This takes 43 seconds and 10Gb RAM.
Expand Down
4 changes: 3 additions & 1 deletion src/cactus/cactus_progressive_config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,8 @@
<!-- partialOrderAlignmentMinimizerK abpoa kmer size for minimizer seeding. -->
<!-- partialOrderAlignmentMinimizerW abpoa window size for minimizer seeding. -->
<!-- partialOrderAlignmentMinimizerMinW abpoa minimum window size. -->
<!-- partialOrderAlignmentProgressiveMode= use guide tree from jaccard distance matrix to determine poa order -->
<!-- partialOrderAlignmentProgressiveMode use guide tree from jaccard distance matrix to determine poa order -->
<!-- partialOrderAlignmentProgressiveMaxRows disable progressive mode if there are more than this many rows to align -->
<poa
partialOrderAlignmentWindow="10000"
partialOrderAlignmentMaskFilter="-1"
Expand All @@ -264,6 +265,7 @@
partialOrderAlignmentMinimizerW="10"
partialOrderAlignmentMinimizerMinW="500"
partialOrderAlignmentProgressiveMode="1"
partialOrderAlignmentProgressiveMaxRows="5000"
/>
</bar>

Expand Down
16 changes: 10 additions & 6 deletions src/cactus/progressive/multiCactusTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,18 @@ def traverseSubtree(self, root, node):
yield i

# Extracts a tree spanning the nodes with the given names.
def extractSpanningTree(self, nodes):
def extractSpanningTree(self, nodes, root_id=None):
assert len(nodes) > 1
nodeIds = [self.nameToId[name] for name in nodes]
paths = [dijkstra_path(self.nxDg.to_undirected(), source=nodeIds[0], target=x) for x in nodeIds[1:]]
nodesToInclude = set()
for path in paths:
for node in path:
nodesToInclude.add(node)
if root_id is not None and all([self.getParent(n) == root_id for n in nodeIds]):
# this is a simple star tree, don't need path-finding logic below which doesn't scale to giant trees
nodesToInclude = nodeIds + [root_id]
else:
paths = [dijkstra_path(self.nxDg.to_undirected(), source=nodeIds[0], target=x) for x in nodeIds[1:]]
nodesToInclude = set()
for path in paths:
for node in path:
nodesToInclude.add(node)

cpy = self.nxDg.subgraph(nodesToInclude).copy()
# Get rid of nodes that have only 1 children
Expand Down
4 changes: 2 additions & 2 deletions src/cactus/progressive/progressive_decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from cactus.progressive.seqFile import SeqFile
from sonLib.nxnewick import NXNewick
from toil.statsAndLogging import logger

from toil.realtimeLogger import RealtimeLogger

def parse_seqfile(seqfile_path, config_wrapper, root_name = None, pangenome = False):
"""
Expand Down Expand Up @@ -201,7 +201,7 @@ def get_spanning_subtree(mc_tree, root_name, config_wrapper, outgroup_map):
node_id_set.add(node_id)

# get the spanning tree
spanning_tree = mc_tree.extractSpanningTree([mc_tree.getName(node) for node in node_id_set])
spanning_tree = mc_tree.extractSpanningTree([mc_tree.getName(node) for node in node_id_set], root_id = root_id)

spanning_tree.computeSubtreeRoots()
return spanning_tree
Expand Down

0 comments on commit 6a64782

Please sign in to comment.