Merge pull request #224 from HumanCellAtlas/optimus_gene_id_fix

added a fix for optimus to use gene_ids instead of gene names
HumanCellAtlas · Jun 19, 2019 · 5e5e2b8 · 5e5e2b8
2 parents 31e6134 + 04e8fb5
commit 5e5e2b8
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 4 deletions.
diff --git a/docker/dropseq_tools/Dockerfile b/docker/dropseq_tools/Dockerfile
@@ -10,6 +10,8 @@ RUN apt update && apt install -y \
   curl \
   unzip
 
+RUN  apt install -y python
+
 RUN curl http://mccarrolllab.com/download/922/ >> Drop-seq_tools-1.12.zip && \
   unzip Drop-seq_tools-1.12.zip && \
   cp -r Drop-seq_tools-1.12/* /usr/local/bin/
diff --git a/library/tasks/CreateCountMatrix.wdl b/library/tasks/CreateCountMatrix.wdl
@@ -80,10 +80,35 @@ task CreateSparseCountMatrix {
   command {
     set -e
 
+    if file --mime-type "${gtf_file}" | grep  gzip; then
+       gunzip -c  "${gtf_file}" > input.gtf
+    else
+        mv "${gtf_file}"  input.gtf
+    fi
+
+    python -u <<CODE
+    import re
+
+    in_gtf = "input.gtf"
+    out_gtf = "gene_id_as_gene_name.gtf"
+
+    with open(in_gtf, 'r') as fpin, open(out_gtf, 'w') as fout:
+         for _line in fpin:
+             line = _line.strip()
+             gene_id = re.search(r'gene_id ([^;]*);', line)
+             gene_name = re.search(r'gene_name ([^;]*);', line)
+             if gene_id and gene_name:
+                 modified_line = re.sub(r'gene_name ([^;]*);', 'gene_name ' + gene_id.group(1) + ";", line)
+                 fout.write(modified_line + '\n')
+             else:
+                 fout.write(line + '\n')
+
+    CODE
+
     CreateCountMatrix \
       --bam-file ${bam_input} \
       --output-prefix sparse_counts \
-      --gtf-annotation-file ${gtf_file} \
+      --gtf-annotation-file gene_id_as_gene_name.gtf \
       --cell-barcode-tag CB \
       --molecule-barcode-tag UB \
       --gene-id-tag GE

diff --git a/library/tasks/TagGeneExon.wdl b/library/tasks/TagGeneExon.wdl
@@ -3,10 +3,10 @@ task TagGeneExon {
   File bam_input
 
   # runtime values
-  String docker = "quay.io/humancellatlas/secondary-analysis-dropseqtools:v0.2.2-1.12"
+  String docker = "quay.io/humancellatlas/secondary-analysis-dropseqtools:v0.2.2-1.13"
   Int machine_mem_mb = 8250
   Int cpu = 1
-  Int disk = ceil((size(bam_input, "Gi") + size(annotations_gtf, "Gi")) * 3)
+  Int disk = ceil((size(bam_input, "Gi") + size(annotations_gtf, "Gi")) * 3) + 20
   Int preemptible = 3
 
   meta {
@@ -26,12 +26,36 @@ task TagGeneExon {
  command {
     set -e
 
+    if file --mime-type "${annotations_gtf}" | grep  gzip; then
+        gunzip -c  "${annotations_gtf}" > input.gtf
+    else
+        mv "${annotations_gtf}"  input.gtf
+    fi
+
+    python -u <<CODE
+    import re
+
+    in_gtf = "input.gtf"
+    out_gtf = "gene_id_as_gene_name.gtf"
+    with open(in_gtf, 'r') as fpin, open(out_gtf, 'w') as fout:
+        for _line in fpin:
+             line = _line.strip()
+             gene_id = re.search(r'gene_id ([^;]*);', line)
+             gene_name = re.search(r'gene_name ([^;]*);', line)
+             if gene_id and gene_name:
+                 modified_line = re.sub(r'gene_name ([^;]*);', 'gene_name ' + gene_id.group(1) + ";", line)
+                 fout.write(modified_line + '\n')
+             else:
+                 fout.write(line + '\n')
+
+    CODE
+
     TagReadWithGeneExon \
       INPUT=${bam_input} \
       OUTPUT=bam_with_gene_exon.bam \
       SUMMARY=gene_exon_tag_summary.log \
       TAG=GE \
-      ANNOTATIONS_FILE=${annotations_gtf}
+      ANNOTATIONS_FILE=gene_id_as_gene_name.gtf
   }
 
   # Larger genomes (mouse-human) require a 7.5gb instance; single-organism genomes work with 3.75gb