GL-229: Add .loom support to Optimus (#253)

* add loom to optimus * add loom test * update optimus diagram * update documentation
HumanCellAtlas · Sep 9, 2019 · 84bc10d · 84bc10d
1 parent 7ac52e8
commit 84bc10d
Show file tree

Hide file tree

Showing 13 changed files with 371 additions and 47 deletions.
diff --git a/docker/zarr-to-loom/Dockerfile b/docker/zarr-to-loom/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.7.4
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+RUN mkdir /tools
+COPY optimus_zarr_to_loom.py /tools/
+COPY unpackZARR.sh /tools/
+ENV PATH=${PATH}:/tools/
diff --git a/docker/zarr-to-loom/build.sh b/docker/zarr-to-loom/build.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+tag=$1
+
+if [ -z $tag ]; then
+    echo -e "\nYou must provide a tag"
+    echo -e "\nUsage: bash build_docker.sh TAG\n"
+    exit 1
+fi
+
+
+docker build . -t quay.io/humancellatlas/zarr-to-loom:$tag
+echo "You can now push with docker push quay.io/humancellatlas/zarr-to-loom:$tag"
diff --git a/docker/zarr-to-loom/optimus_zarr_to_loom.py b/docker/zarr-to-loom/optimus_zarr_to_loom.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+import zarr
+import loompy
+import scipy as sc
+from scipy.sparse import coo_matrix
+import argparse
+import numpy as np
+
+
+# Custom Exception
+class UnexpectedInputFormat(Exception):
+    pass
+
+
+def main():
+    # Parse the arguments
+    description = """This script converts ZARR optimus output to loom format"""
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument('--input-zarr', dest="input_zarr_path", required=True, help="Path to input ZARR file")
+    parser.add_argument('--output-loom', dest="output_loom_path", required=True, help="Path to output loom file")
+    parser.add_argument('--sample-id', dest="sample_id", required=True, help="Sample identifier")
+    args = parser.parse_args()
+
+    input_zarr_path = args.input_zarr_path
+    output_loom_path = args.output_loom_path
+    sample_id = args.sample_id
+
+    # Checks on inputs
+    if not os.path.isdir(input_zarr_path):
+        sys.exit("Error: the input ZARR path is not a directory.")
+    if os.path.exists(output_loom_path):
+        sys.exit("Error: The output loom file exists!")
+
+    # Open the ZARR
+    store = zarr.DirectoryStore(input_zarr_path)
+    root = zarr.open(store)
+
+    # Get the expression matrix
+    # expression matrix in numpy ndarray format (dense)
+    # NOTE: If memory is limiting this could be done by chunk
+    nrows = root[f"/{sample_id}.zarr/expression"].shape[0]
+    ncols = root[f"/{sample_id}.zarr/expression"].shape[1]
+
+    expr_sp = sc.sparse.coo_matrix((nrows, ncols), np.float32)
+
+    iter_row_count = 100;
+
+    xcoord = []
+    ycoord = []
+    value = []
+
+    chunk_row_size = 10000
+    chunk_col_size = 10000
+
+    for i in range(0, nrows, chunk_row_size):
+        for j in range(0, ncols, chunk_col_size):
+            p = chunk_row_size
+            if i + chunk_row_size > nrows:
+                p = nrows - 1
+            q = chunk_col_size
+            if j + chunk_col_size > ncols:
+                q = ncols - j
+            expr_sub_row_coo = sc.sparse.coo_matrix(root[f"/{sample_id}.zarr/expression"][i:(i+p), j:(j+q)])
+            for k in range(0, expr_sub_row_coo.data.shape[0]):
+                xcoord.append(expr_sub_row_coo.row[k] + i)
+                ycoord.append(expr_sub_row_coo.col[k] + j)
+                value.append(expr_sub_row_coo.data[k])
+
+    xcoord = np.asarray(xcoord)
+    ycoord = np.asarray(ycoord)
+    value = np.asarray(value)
+
+    expr_sp_t = sc.sparse.coo_matrix((value, (ycoord, xcoord)), shape=(expr_sp.shape[1], expr_sp.shape[0]))
+
+    del xcoord
+    del ycoord
+    del value
+
+    # ROW/GENE Metadata
+
+    # Check that the first gene metadata column is the gene name as expected
+    if not (root[f"/{sample_id}.zarr/gene_metadata_string_name"][:] == 'gene_name')[0]:
+        raise UnexpectedInputFormat("The first gene metadata item is not the gene_name");
+
+    # Prepare row attributes (gene attributes)
+    # Follow loom file Conventions
+    row_attrs = {
+        "Gene": root[f"/{sample_id}.zarr/gene_metadata_string"][:][0,],
+        "Accession": root[f"/{sample_id}.zarr/gene_id"][:]}
+
+    numeric_field_names = root[f"/{sample_id}.zarr/gene_metadata_numeric_name"][:]
+
+    # Generate with a list
+    for i in range(0, numeric_field_names.shape[0]):
+        name = numeric_field_names[i]
+        data = root[f"/{sample_id}.zarr/gene_metadata_numeric"][:][:, i]
+        row_attrs[name] = data
+
+    # COLUMN/CELL Metadata
+    col_attrs = dict()
+    col_attrs["CellID"] = root[f"/{sample_id}.zarr/cell_id"][:]
+    bool_field_names = root[f"/{sample_id}.zarr/cell_metadata_bool_name"][:]
+
+    for i in range(0, bool_field_names.shape[0]):
+        name = bool_field_names[i]
+        data = root[f"/{sample_id}.zarr/cell_metadata_bool"][:][:, i]
+        col_attrs[name] = data
+
+    float_field_names = root[f"/{sample_id}.zarr/cell_metadata_float_name"][:]
+
+    def add_to_cell_meta_float_by_index(i):
+        name = float_field_names[i]
+        data = root[f"/{sample_id}.zarr/cell_metadata_float"][:][:, i]
+        col_attrs[name] = data
+
+    [add_to_cell_meta_float_by_index(i) for i in range(0, float_field_names.shape[0])]
+
+    # Generate the loom file
+    loompy.create(output_loom_path, expr_sp_t, row_attrs, col_attrs)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/docker/zarr-to-loom/requirements.txt b/docker/zarr-to-loom/requirements.txt
@@ -0,0 +1,13 @@
+asciitree==0.3.3
+fasteners==0.15
+h5py==2.9.0
+loompy==2.0.17
+monotonic==1.5
+numcodecs==0.6.3
+numpy==1.17.1
+pandas==0.25.1
+python-dateutil==2.8.0
+pytz==2019.2
+scipy==1.3.1
+six==1.12.0
+zarr==2.3.2
diff --git a/docker/zarr-to-loom/unpackZARR.sh b/docker/zarr-to-loom/unpackZARR.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Usage function
+function usage() {
+  echo "Usage: $0 -i input_directory -o output_directory"
+}
+
+# Read the CLI arguments
+while getopts ":i:o:h" opt; do
+  case "${opt}" in
+    i)
+      inputDir=$OPTARG
+      ;;
+    o)
+      outputDir=$OPTARG
+      ;;
+    h)
+      usage
+      exit 0;
+      ;;
+    *)
+      usage
+      exit 1;
+      ;;
+    esac
+done
+shift $((OPTIND -1))
+
+# Check the inputs
+if [ -z "$inputDir" ]; then
+  echo "Error: input directory (-i) not specified"
+  exit 1;
+fi
+if [ -z "$outputDir" ]; then
+  echo "Error: output directory (-o) not specified"
+  exit 1;
+fi
+if [ ! -d "$inputDir" ]; then
+  echo "Error the input directory path is not a directory";
+  exit 1;
+fi
+if [ ! -d "$outputDir" ]; then
+  echo "Error; the output directory path is not a directory";
+  exit;
+fi
+
+# Do the conversion
+while IFS= read -r -d '' f
+do
+  bname1=$(basename "${f}")
+  rel_final_path=$(echo "${bname1}" | tr '!' '/')
+  rel_final_dirname=$(dirname "${rel_final_path}")
+  rel_final_bname=$(basename "${rel_final_path}")
+  mkdir -p "${outputDir}/${rel_final_dirname}"
+  cp "${f}" "${outputDir}/${rel_final_dirname}/${rel_final_bname}"
+done <   <(find "${inputDir}" -type f -print0 )
diff --git a/library/tasks/ZarrUtils.wdl b/library/tasks/ZarrUtils.wdl
@@ -128,3 +128,45 @@ task OptimusZarrConversion {
   }
 }
 
+task OptimusZarrToLoom {
+    String sample_id
+    Array[File] zarr_files
+
+    # runtime values
+    String docker = "quay.io/humancellatlas/zarr-to-loom:0.0.1"
+
+    Int preemptible = 3
+    Int cpu = 1
+
+    meta {
+         description: "This task converts the Optimus Zarr output into a loom file"
+    }
+
+    parameter_meta {
+        machine_mem_mb: "(optional) the amount of memory in (MiB) to provision for this task"
+        cpu: "(optional) the number of cpus to provision for this task"
+        preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non-preemptible machine"
+    }
+
+    command {
+        set -euo pipefail
+
+        mkdir packed_zarr
+        mv ${sep=' ' zarr_files} packed_zarr/
+        mkdir unpacked_zarr
+        unpackZARR.sh -i packed_zarr -o unpacked_zarr
+        optimus_zarr_to_loom.py --input-zarr unpacked_zarr --output-loom output.loom --sample-id ${sample_id}
+    }
+
+    runtime {
+        docker: docker
+        cpu: 1
+        memory: "16 GiB"
+        disks: "local-disk 100 HDD"
+        preemptible: preemptible
+    }
+
+    output {
+        File loom_output = "output.loom"
+    }
+}
diff --git a/pipelines/optimus/Optimus.wdl b/pipelines/optimus/Optimus.wdl
@@ -40,8 +40,8 @@ workflow Optimus {
   # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
   Array[Int] indices = range(length(r1_fastq))
 
-  # whether to convert the outputs to Zarr format, by default it's set to true
-  Boolean output_zarr = true
+  # If true produce the optional loom output
+  Boolean output_loom = false
 
   # this pipeline does not set any preemptible varibles and only relies on the task-level preemptible settings
   # you could override the tasklevel preemptible settings by passing it as one of the workflows inputs
@@ -208,33 +208,42 @@ workflow Optimus {
       col_index = MergeCountFiles.col_index
   }
 
-  if (output_zarr) {
-    call ZarrUtils.OptimusZarrConversion{
+  call ZarrUtils.OptimusZarrConversion{
+    input:
+      sample_id = sample_id,
+      annotation_file = annotations_gtf,
+      cell_metrics = MergeCellMetrics.cell_metrics,
+      gene_metrics = MergeGeneMetrics.gene_metrics,
+      sparse_count_matrix = MergeCountFiles.sparse_count_matrix,
+      cell_id = MergeCountFiles.row_index,
+      gene_id = MergeCountFiles.col_index,
+      empty_drops_result = RunEmptyDrops.empty_drops_result
+  }
+
+  if (output_loom) {
+    call ZarrUtils.OptimusZarrToLoom {
       input:
-        sample_id=sample_id,
-        annotation_file=annotations_gtf,
-        cell_metrics = MergeCellMetrics.cell_metrics,
-        gene_metrics = MergeGeneMetrics.gene_metrics,
-        sparse_count_matrix = MergeCountFiles.sparse_count_matrix,
-        cell_id = MergeCountFiles.row_index,
-        gene_id = MergeCountFiles.col_index,
-        empty_drops_result = RunEmptyDrops.empty_drops_result
+        sample_id = sample_id,
+        zarr_files = OptimusZarrConversion.zarr_output_files
     }
-   }
+  }
 
   output {
-      # version of this pipeline
-      String pipeline_version = version
-
-      File bam = MergeSorted.output_bam
-      File matrix = MergeCountFiles.sparse_count_matrix
-      File matrix_row_index = MergeCountFiles.row_index
-      File matrix_col_index = MergeCountFiles.col_index
-      File cell_metrics = MergeCellMetrics.cell_metrics
-      File gene_metrics = MergeGeneMetrics.gene_metrics
-      File cell_calls = RunEmptyDrops.empty_drops_result
-
-      # zarr
-      Array[File]? zarr_output_files = OptimusZarrConversion.zarr_output_files
+    # version of this pipeline
+    String pipeline_version = version
+
+    File bam = MergeSorted.output_bam
+    File matrix = MergeCountFiles.sparse_count_matrix
+    File matrix_row_index = MergeCountFiles.row_index
+    File matrix_col_index = MergeCountFiles.col_index
+    File cell_metrics = MergeCellMetrics.cell_metrics
+    File gene_metrics = MergeGeneMetrics.gene_metrics
+    File cell_calls = RunEmptyDrops.empty_drops_result
+
+    # zarr
+    Array[File] zarr_output_files = OptimusZarrConversion.zarr_output_files
+
+    # loom
+    File? loom_output_file = OptimusZarrToLoom.loom_output
   }
 }
diff --git a/pipelines/optimus/Optimus_diagram.png b/pipelines/optimus/Optimus_diagram.png
diff --git a/pipelines/optimus/PublicDocumentation.md b/pipelines/optimus/PublicDocumentation.md
@@ -26,7 +26,7 @@ The bead-specific barcodes and UMIs are encoded on sequencing primers that also
 | Aligner           |STAR       |[Dobin, et al.,2013](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/)|
 | Transcript Quantification |Utilities for processing large-scale single cell datasets |[Sctools](https://github.com/HumanCellAtlas/sctools) |                      
 |Data Input File Format |File format in which sequencing data is provided |[FASTQ](https://academic.oup.com/nar/article/38/6/1767/3112533) |                       
-|Data Output File Format |File formats in which Optimus output is provided |[BAM](http://samtools.github.io/hts-specs/), [Zarr version 2](https://zarr.readthedocs.io/en/stable/spec/v2.html), Python numpy arrays |
+|Data Output File Format |File formats in which Optimus output is provided |[BAM](http://samtools.github.io/hts-specs/), [Zarr version 2](https://zarr.readthedocs.io/en/stable/spec/v2.html), Python numpy arrays (internal), [loom](http://loompy.org/) |
 
 ## Optimus Modules Summary
 

diff --git a/pipelines/optimus/README.md b/pipelines/optimus/README.md
@@ -37,6 +37,7 @@ Following are the the types of files produced from the pipeline.
 | gene_metrics | merged-gene-metrics.csv.gz | gene metrics | compressed csv | Matrix of metrics by genes | Yes| sctools |
 | cell_calls | empty_drops_result.csv | cell calls | csv | | Yes | emptyDrops |
 | zarr_output_files | {unique_id}.zarr!.zattrs | | zarr store? sparse matrix? | | Yes | | 
+| loom_output_file | output.loom | Loom | Loom | Loom file with expression data and metadata | N/A | N/A |
 
 ### Components of Optimus
 The source code is available from [Github](https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/Optimus.wdl), an overview of the pipeline can be found on the [HCA Data Portal](https://prod.data.humancellatlas.org/) and the benchmarking that was performed on the pipeline can be found [here](https://docs.google.com/document/d/158ba_xQM9AYyu8VcLWsIvSoEYps6PQhgddTr9H0BFmY/edit#heading=h.calfpviouwbg). Some of the tasks in Optimus use the [sctools](https://github.com/HumanCellAtlas/sctools) library of utilities for large scale distributed single cell data processing, and [Picard](https://broadinstitute.github.io/picard/) tools, a set of command line tools for manipulating high-throughput sequencing data in formats such as SAM/BAM/CRAM and VCF.