Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge VCF records at the same position together after bcftools norm left shifts them #1536

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@
[submodule "submodules/red"]
path = submodules/red
url = https://github.com/glennhickey/red.git
[submodule "submodules/collapse-bubble"]
path = submodules/collapse-bubble
url = https://github.com/Han-Cao/collapse-bubble.git
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ modules = api setup caf bar hal reference pipeline preprocessor

# submodules are in multiple pass to handle dependencies cactus2hal being dependent on
# both cactus and sonLib
submodules1 = sonLib cPecan hal matchingAndOrdering pinchesAndCacti abPOA lastz paffy red
submodules1 = sonLib cPecan hal matchingAndOrdering pinchesAndCacti abPOA lastz paffy red collapse-bubble
submodules2 = cactus2hal
submodules = ${submodules1} ${submodules2}

Expand Down Expand Up @@ -271,8 +271,12 @@ suball.red:
cd submodules/red && ${MAKE}
ln -f submodules/red/bin/Red ${BINDIR}

suball.collapse-bubble:
chmod +x submodules/collapse-bubble/scripts/merge_duplicates.py
ln -f submodules/collapse-bubble/scripts/merge_duplicates.py ${BINDIR}

subclean.%:
cd submodules/$* && ${MAKE} clean
if [ $(shell ls submodules/$* | grep -i makefile | wc -l) != 0 ]; then cd submodules/$* && ${MAKE} clean; fi

##
# docker
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Cactus uses many different algorithms and individual code contributions, princip
- Andrea Guarracino, Erik Garrison and co-authors for [odgi](https://github.com/pangenome/odgi). Make sure to [cite odgi](https://doi.org/10.1093/bioinformatics/btac308) when using it or its visualizations.
- Hani Z. Girgis for [RED](http://toolsmith.ens.utulsa.edu/)
- Erik Garrison and co-authors for [vcfwave](https://github.com/vcflib/vcflib/blob/master/doc/vcfwave.md). [vcflib citation](https://doi.org/10.1371/journal.pcbi.1009123)
- Martin Frith and collaborators for `last-train` from [last](https://gitlab.com/mcfrith/last). [last-train citation](https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btw742)
- Han Cao for [merge_duplicates](https://github.com/Han-Cao/collapse-bubble) vcf cleanup script

## Installing Manually From Source

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def run(self):
'networkx>=2,<2.8.1',
'pytest',
'cigar',
'biopython'],
'biopython',
'pysam'],

cmdclass = {
'install': PostInstallCommand,
Expand Down
2 changes: 2 additions & 0 deletions src/cactus/cactus_progressive_config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@
<!-- vcfwaveOptions: options to pass to vcfwave (from vcflib) -->
<!-- vcfwaveNorm: run bcftools norm -f (left shift) after vcfwave -->
<!-- bcftoolsNorm: Run bcftools norm -f (left shifting) on (non-raw, non-wave) VCF output (may result in overlapping variants) -->
<!-- mergeDuplicatesOptions: Run merge_duplicates script with these options (empty string means default options) after bcftools norm to prevent multiple entries at the same position. If "0", merge_duplicates will not be run. This applies to normalization as toggled by vcfwaveNorm and/or bctoolsNorm (and is ever run after bcftools norm) -->
<graphmap_join
maxNodeLength="1024"
gfaffix="1"
Expand All @@ -429,6 +430,7 @@
vcfwaveOptions="-I 1000"
vcfwaveNorm="1"
bcftoolsNorm="0"
mergeDuplicatesOptions=""
/>
<!-- hal2vg options -->
<!-- includeMinigraph: include minigraph node sequences as paths in output (note that cactus-graphmap-join will still remove them by default) -->
Expand Down
50 changes: 43 additions & 7 deletions src/cactus/refmap/cactus_graphmap_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,11 +580,13 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids):

# optional vcf
if workflow_phase in options.vcf:
vcf_prev_job = ref_fasta_job if ref_fasta_job else gfa_root_job
for vcf_ref in options.vcfReference:
vcf_job = vcf_prev_job.addFollowOnJobFn(make_vcf, config, options, workflow_phase,
vcf_job = gfa_root_job.addFollowOnJobFn(make_vcf, config, options, workflow_phase,
index_mem, vcf_ref, phase_vg_ids,
ref_fasta_job.rv() if ref_fasta_job else None)
if ref_fasta_job:
ref_fasta_job.addFollowOn(vcf_job)

out_dicts.append(vcf_job.rv())

# optional giraffe
Expand Down Expand Up @@ -1075,14 +1077,23 @@ def vcfbub(job, config, out_name, vcf_ref, vcf_id, tbi_id, max_ref_allele, fasta
vcfbub_path = os.path.join(work_dir, os.path.basename(out_name) + '.' + tag + 'bub.vcf.gz')
assert max_ref_allele
bub_cmd = [['vcfbub', '--input', vcf_path, '--max-ref-length', str(max_ref_allele), '--max-level', '0']]
if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "bcftoolsNorm", typeFn=bool, default=False):
run_norm = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "bcftoolsNorm", typeFn=bool, default=False)
if run_norm:
fa_ref_path = os.path.join(work_dir, tag + 'fa.gz')
job.fileStore.readGlobalFile(fasta_ref_dict[vcf_ref], fa_ref_path)
bub_cmd.append(['bcftools', 'norm', '-f', fa_ref_path])
bub_cmd.append(['bcftools', 'norm', '-m', '+any'])
bub_cmd.append(['bcftools', 'sort', '-T', os.path.join(work_dir, 'bcftools.XXXXXX')])
bub_cmd.append(['bgzip', '--threads', str(job.cores)])
cactus_call(parameters=bub_cmd, outfile=vcfbub_path)

merge_duplicates_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "mergeDuplicatesOptions", typeFn=str, default=None)
if merge_duplicates_opts and merge_duplicates_opts != "0" and run_norm:
#note: merge_duplcates complains about not having a .tbi but I don't think it actually affects anything
merge_path = os.path.join(work_dir, os.path.basename(out_name) + '.' + vcf_ref + tag + 'bub.merge.vcf.gz')
cactus_call(parameters=['merge_duplicates.py', '-i', vcfbub_path, '-o', merge_path] + merge_duplicates_opts.split(' '))
vcfbub_path = merge_path

try:
cactus_call(parameters=['tabix', '-p', 'vcf', vcfbub_path])
tbi_path = vcfbub_path + '.tbi'
Expand Down Expand Up @@ -1112,21 +1123,46 @@ def vcfwave(job, config, out_name, vcf_ref, vcf_id, tbi_id, max_ref_allele, fast
return vcf_id, tbi_id

# run vcfbub and vcfwave, using original HPRC recipe
# allele splitting added here as vcfwave has history of trouble with multi-allelic sites
vcfwave_path = os.path.join(work_dir, os.path.basename(out_name) + '.' + vcf_ref + tag + 'wave.vcf.gz')
wave_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "vcfwaveOptions", typeFn=str, default=None)
assert wave_opts
bubwave_cmd = [['vcfbub', '--input', vcf_path, '-l', '0', '-a', str(max_ref_allele)],
['bcftools', 'annotate', '-x', 'INFO/AT'],
['bcftools', 'annotate', '-x', 'INFO/AT'],
['bcftools', 'norm', '-m', '-any'],
['vcfwave'] + wave_opts.split(' ') + ['-t', str(job.cores)]]

if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "vcfwaveNorm", typeFn=bool, default=True):
run_norm = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "vcfwaveNorm", typeFn=bool, default=True)
merge_duplicates_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "mergeDuplicatesOptions", typeFn=str, default=None)
run_merge = merge_duplicates_opts is not None and merge_duplicates_opts != "0" and run_norm
if run_norm:
fa_ref_path = os.path.join(work_dir, tag + 'fa.gz')
job.fileStore.readGlobalFile(fasta_ref_dict[vcf_ref], fa_ref_path)
bubwave_cmd.append(['bcftools', 'norm', '-f', fa_ref_path])
bubwave_cmd.append(['bcftools', 'sort', '-T', os.path.join(work_dir, 'bcftools.XXXXXX')])

bubwave_cmd.append(['bgzip'])

cactus_call(parameters=bubwave_cmd, outfile=vcfwave_path)

# stable sort, which is apparently not guaranteed by bcftools sort
# (in case we merge later, it's best to preserve variant order)
temp_path = os.path.join(work_dir, os.path.basename(out_name) + '.' + vcf_ref + tag + 'wave.tmp.vcf.gz')
cactus_call(parameters=['bcftools', 'view', '-Oz', '-h', vcfwave_path], outfile=temp_path)
cactus_call(parameters=[['bcftools', 'view', '-H', vcfwave_path],
['sort', '-k1,1d', '-k2,2n', '-s', '-T', work_dir],
['bgzip']], outfile=temp_path, outappend=True)
vcfwave_path, temp_path = temp_path, vcfwave_path

if run_merge:
#note: merge_duplcates complains about not having a .tbi but I don't think it actually affects anything
merge_cmd = ['merge_duplicates.py', '-i', vcfwave_path, '-o', temp_path]
if merge_duplicates_opts:
merge_cmd.append(merge_duplicates_opts.split(' '))
cactus_call(parameters=merge_cmd)
vcfwave_path, temp_path = temp_path, vcfwave_path

cactus_call(parameters=['bcftools', 'norm', '-m', '+any', '-Oz', vcfwave_path], outfile=temp_path)
vcfwave_path, temp_path = temp_path, vcfwave_path

try:
cactus_call(parameters=['tabix', '-p', 'vcf', vcfwave_path])
tbi_path = vcfwave_path + '.tbi'
Expand Down
1 change: 1 addition & 0 deletions submodules/collapse-bubble
Submodule collapse-bubble added at 76070b