Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass ID range components around as strings #825

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/toil_vg/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,22 +114,23 @@ def get_out_store(self):
else:
return None

def write_intermediate_file(self, job, path):
def write_intermediate_file(self, job, path, out_store_path = None):
"""
Write the file at the given path to the given job's Toil FileStore, and
to the out_store if one is in use and we are trying to dump intermediate
files.

In the out_store, the file is saved under its basename, in the root
directory.
directory. If out_store_path is set, the file is saved there instead.

Returns the Toil file ID for the written file.
"""

out_store = self.get_out_store()
if out_store is not None and self.config.force_outstore:
name = out_store_path if out_store_path else os.path.basename(path)
# Save to the out_store if it exists
out_store.write_output_file(path, os.path.basename(path))
out_store.write_output_file(path, name)

# Save to Toil
return job.fileStore.writeGlobalFile(path)
Expand Down
36 changes: 34 additions & 2 deletions src/toil_vg/test/test_vg.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def test_06_BRCA1_NA12877(self):
self._assertOutput('NA12877', self.local_outstore, f1_threshold=0.45)

def test_07_BRCA1_BRCA2_NA12877(self):
''' Test pipeline on chase with two chromosomes, in this case both BRCA regions
''' Test pipeline on case with two chromosomes, in this case both BRCA regions
'''
self._download_input('NA12877.brca1.brca2.bam.fq.gz')
self._download_input('snp1kg-brca1.vg')
Expand Down Expand Up @@ -595,7 +595,39 @@ def _test_09_construct(self, container_override):
if prev_vg_size is not None:
assert vg_size <= prev_vg_size
prev_vg_size = vg_size


def test_10_construct_multiple_contigs(self):
    '''
    Test ability to group construction jobs, in the shape of GRCh38.

    Runs ``toil-vg construct`` on the fake GRCh38 inputs with
    ``--coalesce_regions`` and checks that a ``coalesced0`` graph exists in
    the local out store for each of the '_', '_filter_' and '_minaf_0.01_'
    output name variants.
    '''

    # Regions are requested without the "chr" prefix (except chrM), while
    # the per-chromosome VCF file names carry it. A catch-all 'others' VCF
    # is passed along with the per-chromosome ones.
    chrom_names = [str(x) for x in range(1, 23)] + ['X', 'Y']
    vcf_bases = ['chr{}'.format(x) for x in chrom_names] + ['others']
    region_names = chrom_names + ['chrM']

    # NOTE(review): presumably a '.tbi' index sits next to each VCF and is
    # found by path convention; it is not passed on the command line.
    in_vcfs = [self._ci_input_path('GRCh38.1000gp.fake.{}.vcf.gz'.format(vcf_base))
               for vcf_base in vcf_bases]
    in_fa = self._ci_input_path('GRCh38.fake.fa')
    in_coalesce_regions = self._ci_input_path('GRCh38.1000gp.fake.minor_contigs.tsv')
    out_name = 'Fake1000GP'

    print("Construct to " + self.local_outstore)

    command = ['toil-vg', 'construct', self.jobStoreLocal, self.local_outstore,
               '--container', self.containerType,
               '--clean', 'never',
               '--fasta', in_fa, '--vcf'] + in_vcfs + ['--vcf_phasing'] + in_vcfs + [
               '--regions'] + region_names + ['--fasta_regions', '--remove_chr_prefix',
               '--out_name', out_name, '--pangenome', '--filter_ceph', '--min_af', '0.01',
               '--all_index',
               '--realTimeLogging', '--logInfo', '--coalesce_regions', in_coalesce_regions]

    self._run(command)
    self._run(['toil', 'clean', self.jobStoreLocal])

    for middle in ['_', '_filter_', '_minaf_0.01_']:
        # Should now leave a coalesced region
        coalesced_vg = os.path.join(self.local_outstore,
                                    '{}{}coalesced0.vg'.format(out_name, middle))
        self.assertTrue(os.path.isfile(coalesced_vg),
                        'Missing coalesced graph: {}'.format(coalesced_vg))

def test_11_gbwt(self):
'''
Test that the gbwt gets constructed without crashing (but not much beyond that)
Expand Down
67 changes: 67 additions & 0 deletions src/toil_vg/vg_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,10 @@ def call_with_docker(self, job, args, work_dir, outfile, errfile, check_output,

# When we get here, the container has been run, and stdout is either in the file object we sent it to or in the Docker logs.
# stderr is always in the Docker logs.

if isinstance(return_code, dict) and 'StatusCode' in return_code:
# New? Docker gives us a dict like this
return_code = return_code['StatusCode']

if return_code != 0:
# What were we doing?
Expand Down Expand Up @@ -1087,3 +1091,66 @@ def resolve(self, result):
return result.result()
else:
return result


def apply_coalesce(regions, region_names=None, coalesce_regions=None):
    """
    Given a list of regions, and a list of sets of regions to coalesce if all
    are present, produce a list of regions or sets of regions, preserving the
    original region order among the non-coalesced regions, with the sets at the
    end.

    Also takes a list of region names, parallel to regions. If not set (or
    empty), the regions themselves are used as names.

    If all the regions in a set are present, they will all be pulled out of the
    normal ordering and put in that set at the end. The set is named
    "<base>coalesced<N>", where N counts the sets actually used (from 0) and
    <base> is the name of the set's last region with everything from the last
    occurrence of the region itself onward removed (or the whole name, if the
    region does not occur in it).

    Empty sets in coalesce_regions are ignored: there is nothing to pull out
    and no member to derive a name from.

    Returns (coalesced regions, coalesced region names). If nothing was
    coalesced, the inputs are returned as they were passed.
    """

    if not region_names:
        region_names = regions
    if coalesce_regions is None:
        # Avoid a shared mutable default argument.
        coalesce_regions = []

    wanted_regions = set(regions)
    # We need to fake the output names for the coalesced regions based on
    # the original ones. So map from region to region name.
    region_to_name = dict(zip(regions, region_names))
    # These will replace regions and region_names if we coalesce away regions
    coalesced_regions = []
    coalesced_names = []
    coalesced_away = set()

    for to_coalesce in coalesce_regions:
        # Snapshot the members so we can both test them and pick one to name
        # the coalesced region after, whatever kind of collection we got.
        members = list(to_coalesce)
        if not members:
            # Nothing to coalesce and no region to base a name on.
            continue
        if not all(member in wanted_regions for member in members):
            # We only coalesce when every region in the set was requested.
            continue

        # Use this coalescing
        coalesced_regions.append(to_coalesce)

        # Try and replace the region in its name, if possible, when naming the
        # coalesced region. The last member of the set is the one we name from.
        name_source = members[-1]
        source_name = region_to_name[name_source]
        region_in_name = source_name.rfind(name_source)
        base_name = source_name[:region_in_name] if region_in_name != -1 else source_name
        coalesced_names.append("{}coalesced{}".format(base_name, len(coalesced_regions) - 1))
        # And skip these regions
        coalesced_away.update(members)

    if coalesced_away:
        # Drop the coalesced regions from regions
        remaining_regions = []
        remaining_names = []
        for i, region in enumerate(regions):
            if region not in coalesced_away:
                remaining_regions.append(region)
                remaining_names.append(region_names[i])
        # And replace the original regions with the coalesced ones, and the
        # remaining uncoalesced regions. Put the remaining ones first because
        # they are probably big chromosomes and may have associated VCFs.
        regions = remaining_regions + coalesced_regions
        region_names = remaining_names + coalesced_names

    return (regions, region_names)
4 changes: 2 additions & 2 deletions src/toil_vg/vg_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@
## of through docker.

# Docker image to use for vg
vg-docker: 'quay.io/vgteam/vg:v1.28.0'
vg-docker: 'quay.io/vgteam/vg:ci-2850-58eed1cb13f9444849933685a7f51a8dc9273ca6'

# Docker image to use for bcftools
bcftools-docker: 'quay.io/biocontainers/bcftools:1.9--h4da6232_0'
Expand Down Expand Up @@ -463,7 +463,7 @@
## of through docker.

# Docker image to use for vg
vg-docker: 'quay.io/vgteam/vg:v1.24.0'
vg-docker: 'quay.io/vgteam/vg:ci-2850-58eed1cb13f9444849933685a7f51a8dc9273ca6'

# Docker image to use for bcftools
bcftools-docker: 'quay.io/biocontainers/bcftools:1.9--h4da6232_0'
Expand Down
Loading