Handle 10x data with no I1 file (#94)
* Handle 10x data with no I1 file

* Fix fastq file names

* Add test for getting cellranger inputs

* Rename cellranger inputs function

* Update pipeline-tools docker tag
samanehsan authored Oct 22, 2018
1 parent 231a2e8 commit c1df3ec
Showing 5 changed files with 932 additions and 49 deletions.
63 changes: 14 additions & 49 deletions adapter_pipelines/cellranger/adapter.wdl
@@ -24,7 +24,7 @@ task GetInputs {
python -u <<CODE
from pipeline_tools import input_utils
-input_utils.create_optimus_input_tsv(
+input_utils.get_cellranger_input_files(
"${bundle_uuid}",
"${bundle_version}",
"${dss_url}")
@@ -36,38 +36,13 @@ task GetInputs {
}
output {
String sample_id = read_string("sample_id.txt")
-Array[File] r1_fastq = read_lines("r1.txt")
-Array[File] r2_fastq = read_lines("r2.txt")
-Array[File] i1_fastq = read_lines("i1.txt")
-Array[Int] lanes = read_lines("lanes.txt")
+Array[File] fastqs = read_lines("fastqs.txt")
+Array[String] fastq_names = read_lines("fastq_names.txt")
Array[File] http_requests = glob("request_*.txt")
Array[File] http_responses = glob("response_*.txt")
}
}
-task RenameFastqFiles {
-File r1
-File r2
-File i1
-String sample_id
-String lane
-String pipeline_tools_version
-command <<<
-mv ${r1} '${sample_id}_S1_L00${lane}_R1_001.fastq.gz'
-mv ${r2} '${sample_id}_S1_L00${lane}_R2_001.fastq.gz'
-mv ${i1} '${sample_id}_S1_L00${lane}_I1_001.fastq.gz'
->>>
-runtime {
-docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
-}
-output {
-File r1_new = "${sample_id}_S1_L00${lane}_R1_001.fastq.gz"
-File r2_new = "${sample_id}_S1_L00${lane}_R2_001.fastq.gz"
-File i1_new = "${sample_id}_S1_L00${lane}_I1_001.fastq.gz"
-}
-}
task RenameFiles {
Array[File] file_paths
Array[String] new_file_names
@@ -175,7 +150,7 @@ workflow Adapter10xCount {
Int max_cromwell_retries = 0
Boolean add_md5s = false
String pipeline_tools_version = "v0.36.0"
String pipeline_tools_version = "v0.37.0"
call GetInputs {
input:
@@ -190,39 +165,29 @@
pipeline_tools_version = pipeline_tools_version
}
-# Cellranger code in 10x count wdl requires files to be named a certain way.
-# To accommodate that, RenameFastqFiles copies the blue box files into the
-# cromwell execution bucket but with the names cellranger expects.
-# Putting this in its own task lets us take advantage of automatic localizing
-# and delocalizing by Cromwell/JES to actually read and write stuff in buckets.
-# TODO: Replace scatter with a for-loop inside of the task to avoid creating a
-# VM for each set of files that needs to be renamed
-scatter(i in range(length(GetInputs.lanes))) {
-call RenameFastqFiles as prep {
-input:
-r1 = GetInputs.r1_fastq[i],
-r2 = GetInputs.r2_fastq[i],
-i1 = GetInputs.i1_fastq[i],
-sample_id = GetInputs.sample_id,
-lane = GetInputs.lanes[i],
-pipeline_tools_version = pipeline_tools_version
-}
-}
+# Rename the fastq files to the format required by CellRanger:
+# {sample_id}_S1_L001_R1_001.fastq.gz'
+call RenameFiles as rename_fastqs {
+input:
+file_paths = GetInputs.fastqs,
+new_file_names = GetInputs.fastq_names,
+pipeline_tools_version = pipeline_tools_version
+}
# CellRanger gets the paths to the fastq directories from the array of fastqs,
# so the order of those files does not matter
call CellRanger.CellRanger as analysis {
input:
sample_id = GetInputs.sample_id,
-fastqs = flatten([prep.r1_new, prep.r2_new, prep.i1_new]),
+fastqs = rename_fastqs.outputs,
reference_name = reference_name,
transcriptome_tar_gz = transcriptome_tar_gz,
expect_cells = expect_cells
}
call InputsForSubmit {
input:
-fastqs = flatten([GetInputs.r1_fastq, GetInputs.r2_fastq, GetInputs.i1_fastq]),
+fastqs = GetInputs.fastqs,
other_inputs = [
{
"name": "sample_id",
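The body of the RenameFiles task is collapsed in this diff. Conceptually it pairs the two parallel arrays emitted by GetInputs and copies each localized fastq to the CellRanger-style name, so the CellRanger call only sees correctly named files. The Python sketch below illustrates that pairing; the paths and names are made up and this is not the task's actual command block.

file_paths = ['/cromwell_root/inputs/1a2b/r1.fastq.gz',      # hypothetical Cromwell-localized inputs
              '/cromwell_root/inputs/3c4d/r2.fastq.gz']
new_file_names = ['sample1_S1_L001_R1_001.fastq.gz',          # parallel list of CellRanger-style names
                  'sample1_S1_L001_R2_001.fastq.gz']

# The two lists share one index: element i of new_file_names is the target name
# for element i of file_paths. Inside the real task this would be a copy/move
# (e.g. shutil.copy or mv); here we just print the mapping.
for src, dst in zip(file_paths, new_file_names):
    print('{} -> {}'.format(src, dst))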
58 changes: 58 additions & 0 deletions pipeline_tools/input_utils.py
@@ -1,3 +1,4 @@
+import os
import functools
import typing
from concurrent.futures import ThreadPoolExecutor
@@ -167,6 +168,7 @@ def create_ss2_input_tsv(bundle_uuid, bundle_version, dss_url, input_tsv_name='i
print('Wrote input map to disk.')


+# TODO: Rename this function since it no longer creates a tsv file
def create_optimus_input_tsv(uuid, version, dss_url):
"""Create TSV of Optimus inputs
@@ -218,3 +220,59 @@ def create_optimus_input_tsv(uuid, version, dss_url):
f.write('{0}'.format(sample_id))

print('Finished writing files')


+def get_cellranger_input_files(uuid, version, dss_url):
+    """ Get inputs for cellranger count workflow
+    Args:
+        uuid (str): the bundle uuid
+        version (str): the bundle version
+        dss_url (str): the DCP Data Storage Service
+    Returns:
+        None
+    Raises:
+        optimus_utils.LaneMissingFileError: if any fastqs are missing
+    """
+    # Get bundle manifest
+    print('Getting bundle manifest for id {0}, version {1}'.format(uuid, version))
+    primary_bundle = get_bundle_metadata(uuid=uuid, version=version, dss_url=dss_url, http_requests=HttpRequests())
+
+    sample_id = get_sample_id(primary_bundle)
+    print('Writing sample ID to sample_id.txt')
+    with open('sample_id.txt', 'w') as f:
+        f.write('{0}'.format(sample_id))
+
+    # Parse inputs from metadata
+    print('Gathering fastq inputs')
+    fastq_files = [f for f in primary_bundle.files.values() if f.file_format == 'fastq.gz']
+    lane_to_fastqs = optimus_utils.create_fastq_dict(fastq_files)
+
+    # Stop if any fastqs are missing
+    optimus_utils.validate_lanes(lane_to_fastqs)
+
+    read_indices = {
+        'read1': 'R1',
+        'read2': 'R2',
+        'index1': 'I1'
+    }
+    fastq_urls = []
+    fastq_names = []
+
+    for lane, reads in lane_to_fastqs.items():
+        for read_index, url in reads.items():
+            new_file_name = '{}_S1_L00{}_{}_001.fastq.gz'.format(sample_id, str(lane), read_indices[read_index])
+            fastq_names.append(new_file_name)
+            fastq_urls.append(url)
+
+    with open('fastqs.txt', 'w') as f:
+        for url in fastq_urls:
+            f.write(url + '\n')
+
+    with open('fastq_names.txt', 'w') as f:
+        for name in fastq_names:
+            f.write(name + '\n')
+
+    print('Finished writing files')
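For illustration, here is what the renaming loop above produces for a hypothetical two-lane bundle in which lane 2 has no index read, the situation this commit handles. It mirrors the loop in get_cellranger_input_files; the sample ID, URLs, and the shape of the lane_to_fastqs dictionary are assumptions, and optimus_utils.create_fastq_dict / validate_lanes are not exercised here.

sample_id = 'heart_1k_test'  # hypothetical
read_indices = {'read1': 'R1', 'read2': 'R2', 'index1': 'I1'}

# Assumed shape of optimus_utils.create_fastq_dict output: lane -> read index -> file URL.
lane_to_fastqs = {
    1: {'read1': 'gs://bundle/l1_r1.fastq.gz',
        'read2': 'gs://bundle/l1_r2.fastq.gz',
        'index1': 'gs://bundle/l1_i1.fastq.gz'},
    2: {'read1': 'gs://bundle/l2_r1.fastq.gz',
        'read2': 'gs://bundle/l2_r2.fastq.gz'},  # no I1 fastq for this lane
}

for lane, reads in lane_to_fastqs.items():
    for read_index, url in reads.items():
        print('{}_S1_L00{}_{}_001.fastq.gz'.format(sample_id, lane, read_indices[read_index]))

# Prints five names: R1/R2/I1 for lane 1 but only R1/R2 for lane 2,
# e.g. heart_1k_test_S1_L002_R2_001.fastq.gz.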
