Handle 10x data with no I1 file (#94)
* Handle 10x data with no I1 file

* Fix fastq file names

* Add test for getting cellranger inputs

* Rename cellranger inputs function

* Update pipeline-tools docker tag
samanehsan authored Oct 22, 2018
1 parent 231a2e8 commit c1df3ec
Showing 5 changed files with 932 additions and 49 deletions.
63 changes: 14 additions & 49 deletions adapter_pipelines/cellranger/adapter.wdl
@@ -24,7 +24,7 @@ task GetInputs {
python -u <<CODE
from pipeline_tools import input_utils
-input_utils.create_optimus_input_tsv(
+input_utils.get_cellranger_input_files(
"${bundle_uuid}",
"${bundle_version}",
"${dss_url}")
@@ -36,38 +36,13 @@ task GetInputs {
}
output {
String sample_id = read_string("sample_id.txt")
-Array[File] r1_fastq = read_lines("r1.txt")
-Array[File] r2_fastq = read_lines("r2.txt")
-Array[File] i1_fastq = read_lines("i1.txt")
-Array[Int] lanes = read_lines("lanes.txt")
+Array[File] fastqs = read_lines("fastqs.txt")
+Array[String] fastq_names = read_lines("fastq_names.txt")
Array[File] http_requests = glob("request_*.txt")
Array[File] http_responses = glob("response_*.txt")
}
}
-task RenameFastqFiles {
-File r1
-File r2
-File i1
-String sample_id
-String lane
-String pipeline_tools_version
-command <<<
-mv ${r1} '${sample_id}_S1_L00${lane}_R1_001.fastq.gz'
-mv ${r2} '${sample_id}_S1_L00${lane}_R2_001.fastq.gz'
-mv ${i1} '${sample_id}_S1_L00${lane}_I1_001.fastq.gz'
->>>
-runtime {
-docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
-}
-output {
-File r1_new = "${sample_id}_S1_L00${lane}_R1_001.fastq.gz"
-File r2_new = "${sample_id}_S1_L00${lane}_R2_001.fastq.gz"
-File i1_new = "${sample_id}_S1_L00${lane}_I1_001.fastq.gz"
-}
-}
task RenameFiles {
Array[File] file_paths
Array[String] new_file_names
@@ -175,7 +150,7 @@ workflow Adapter10xCount {
Int max_cromwell_retries = 0
Boolean add_md5s = false
String pipeline_tools_version = "v0.36.0"
String pipeline_tools_version = "v0.37.0"
call GetInputs {
input:
@@ -190,39 +165,29 @@
pipeline_tools_version = pipeline_tools_version
}
-# Cellranger code in 10x count wdl requires files to be named a certain way.
-# To accommodate that, RenameFastqFiles copies the blue box files into the
-# cromwell execution bucket but with the names cellranger expects.
-# Putting this in its own task lets us take advantage of automatic localizing
-# and delocalizing by Cromwell/JES to actually read and write stuff in buckets.
-# TODO: Replace scatter with a for-loop inside of the task to avoid creating a
-# VM for each set of files that needs to be renamed
-scatter(i in range(length(GetInputs.lanes))) {
-call RenameFastqFiles as prep {
-input:
-r1 = GetInputs.r1_fastq[i],
-r2 = GetInputs.r2_fastq[i],
-i1 = GetInputs.i1_fastq[i],
-sample_id = GetInputs.sample_id,
-lane = GetInputs.lanes[i],
-pipeline_tools_version = pipeline_tools_version
-}
-}
+# Rename the fastq files to the format required by CellRanger:
+# {sample_id}_S1_L001_R1_001.fastq.gz'
+call RenameFiles as rename_fastqs {
+input:
+file_paths = GetInputs.fastqs,
+new_file_names = GetInputs.fastq_names,
+pipeline_tools_version = pipeline_tools_version
+}
# CellRanger gets the paths to the fastq directories from the array of fastqs,
# so the order of those files does not matter
call CellRanger.CellRanger as analysis {
input:
sample_id = GetInputs.sample_id,
-fastqs = flatten([prep.r1_new, prep.r2_new, prep.i1_new]),
+fastqs = rename_fastqs.outputs,
reference_name = reference_name,
transcriptome_tar_gz = transcriptome_tar_gz,
expect_cells = expect_cells
}
call InputsForSubmit {
input:
-fastqs = flatten([GetInputs.r1_fastq, GetInputs.r2_fastq, GetInputs.i1_fastq]),
+fastqs = GetInputs.fastqs,
other_inputs = [
{
"name": "sample_id",
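The body of the RenameFiles task is collapsed in this diff. Conceptually it pairs the two parallel arrays emitted by GetInputs and copies each localized fastq to the CellRanger-style name, so the CellRanger call only sees correctly named files. The Python sketch below illustrates that pairing; the paths and names are made up and this is not the task's actual command block.

file_paths = ['/cromwell_root/inputs/1a2b/r1.fastq.gz',      # hypothetical Cromwell-localized inputs
              '/cromwell_root/inputs/3c4d/r2.fastq.gz']
new_file_names = ['sample1_S1_L001_R1_001.fastq.gz',          # parallel list of CellRanger-style names
                  'sample1_S1_L001_R2_001.fastq.gz']

# The two lists share one index: element i of new_file_names is the target name
# for element i of file_paths. Inside the real task this would be a copy/move
# (e.g. shutil.copy or mv); here we just print the mapping.
for src, dst in zip(file_paths, new_file_names):
    print('{} -> {}'.format(src, dst))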
58 changes: 58 additions & 0 deletions pipeline_tools/input_utils.py
@@ -1,3 +1,4 @@
+import os
import functools
import typing
from concurrent.futures import ThreadPoolExecutor
@@ -167,6 +168,7 @@ def create_ss2_input_tsv(bundle_uuid, bundle_version, dss_url, input_tsv_name='i
print('Wrote input map to disk.')


+# TODO: Rename this function since it no longer creates a tsv file
def create_optimus_input_tsv(uuid, version, dss_url):
"""Create TSV of Optimus inputs
@@ -218,3 +220,59 @@ def create_optimus_input_tsv(uuid, version, dss_url):
f.write('{0}'.format(sample_id))

print('Finished writing files')


+def get_cellranger_input_files(uuid, version, dss_url):
+    """ Get inputs for cellranger count workflow
+    Args:
+        uuid (str): the bundle uuid
+        version (str): the bundle version
+        dss_url (str): the DCP Data Storage Service
+    Returns:
+        None
+    Raises:
+        optimus_utils.LaneMissingFileError: if any fastqs are missing
+    """
+    # Get bundle manifest
+    print('Getting bundle manifest for id {0}, version {1}'.format(uuid, version))
+    primary_bundle = get_bundle_metadata(uuid=uuid, version=version, dss_url=dss_url, http_requests=HttpRequests())
+
+    sample_id = get_sample_id(primary_bundle)
+    print('Writing sample ID to sample_id.txt')
+    with open('sample_id.txt', 'w') as f:
+        f.write('{0}'.format(sample_id))
+
+    # Parse inputs from metadata
+    print('Gathering fastq inputs')
+    fastq_files = [f for f in primary_bundle.files.values() if f.file_format == 'fastq.gz']
+    lane_to_fastqs = optimus_utils.create_fastq_dict(fastq_files)
+
+    # Stop if any fastqs are missing
+    optimus_utils.validate_lanes(lane_to_fastqs)
+
+    read_indices = {
+        'read1': 'R1',
+        'read2': 'R2',
+        'index1': 'I1'
+    }
+    fastq_urls = []
+    fastq_names = []
+
+    for lane, reads in lane_to_fastqs.items():
+        for read_index, url in reads.items():
+            new_file_name = '{}_S1_L00{}_{}_001.fastq.gz'.format(sample_id, str(lane), read_indices[read_index])
+            fastq_names.append(new_file_name)
+            fastq_urls.append(url)
+
+    with open('fastqs.txt', 'w') as f:
+        for url in fastq_urls:
+            f.write(url + '\n')
+
+    with open('fastq_names.txt', 'w') as f:
+        for name in fastq_names:
+            f.write(name + '\n')
+
+    print('Finished writing files')
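For illustration, here is what the renaming loop above produces for a hypothetical two-lane bundle in which lane 2 has no index read, the situation this commit handles. It mirrors the loop in get_cellranger_input_files; the sample ID, URLs, and the shape of the lane_to_fastqs dictionary are assumptions, and optimus_utils.create_fastq_dict / validate_lanes are not exercised here.

sample_id = 'heart_1k_test'  # hypothetical
read_indices = {'read1': 'R1', 'read2': 'R2', 'index1': 'I1'}

# Assumed shape of optimus_utils.create_fastq_dict output: lane -> read index -> file URL.
lane_to_fastqs = {
    1: {'read1': 'gs://bundle/l1_r1.fastq.gz',
        'read2': 'gs://bundle/l1_r2.fastq.gz',
        'index1': 'gs://bundle/l1_i1.fastq.gz'},
    2: {'read1': 'gs://bundle/l2_r1.fastq.gz',
        'read2': 'gs://bundle/l2_r2.fastq.gz'},  # no I1 fastq for this lane
}

for lane, reads in lane_to_fastqs.items():
    for read_index, url in reads.items():
        print('{}_S1_L00{}_{}_001.fastq.gz'.format(sample_id, lane, read_indices[read_index]))

# Prints five names: R1/R2/I1 for lane 1 but only R1/R2 for lane 2,
# e.g. heart_1k_test_S1_L002_R2_001.fastq.gz.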
