Skip to content

Commit

Permalink
Merge pull request #224 from HumanCellAtlas/optimus_gene_id_fix
Browse files Browse the repository at this point in the history
added a fix for optimus to use gene_ids instead of gene names
  • Loading branch information
rhiananthony authored Jun 19, 2019
2 parents 31e6134 + 04e8fb5 commit 5e5e2b8
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 4 deletions.
2 changes: 2 additions & 0 deletions docker/dropseq_tools/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ RUN apt update && apt install -y \
curl \
unzip

RUN apt install -y python

RUN curl http://mccarrolllab.com/download/922/ >> Drop-seq_tools-1.12.zip && \
unzip Drop-seq_tools-1.12.zip && \
cp -r Drop-seq_tools-1.12/* /usr/local/bin/
27 changes: 26 additions & 1 deletion library/tasks/CreateCountMatrix.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,35 @@ task CreateSparseCountMatrix {
command {
set -e

if file --mime-type "${gtf_file}" | grep gzip; then
gunzip -c "${gtf_file}" > input.gtf
else
mv "${gtf_file}" input.gtf
fi

python -u <<CODE
import re

# Rewrite the GTF so that every gene_name attribute carries the gene_id value
# found on the same line. Reads input.gtf and writes gene_id_as_gene_name.gtf,
# both relative to the current working directory.
in_gtf = "input.gtf"
out_gtf = "gene_id_as_gene_name.gtf"

# Compile once instead of re-parsing the patterns for every GTF line.
gene_id_re = re.compile(r'gene_id ([^;]*);')
gene_name_re = re.compile(r'gene_name ([^;]*);')

with open(in_gtf, 'r') as fpin, open(out_gtf, 'w') as fout:
    for _line in fpin:
        line = _line.strip()
        gene_id = gene_id_re.search(line)
        if gene_id:
            replacement = 'gene_name ' + gene_id.group(1) + ';'
            # A callable replacement is inserted verbatim, so a backslash in
            # the gene_id value is never treated as a regex escape or group
            # reference. Lines without a gene_name attribute pass through
            # unchanged because sub() finds nothing to replace.
            line = gene_name_re.sub(lambda _match: replacement, line)
        fout.write(line + '\n')
CODE
CreateCountMatrix \
--bam-file ${bam_input} \
--output-prefix sparse_counts \
--gtf-annotation-file ${gtf_file} \
--gtf-annotation-file gene_id_as_gene_name.gtf \
--cell-barcode-tag CB \
--molecule-barcode-tag UB \
--gene-id-tag GE
Expand Down
30 changes: 27 additions & 3 deletions library/tasks/TagGeneExon.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ task TagGeneExon {
File bam_input

# runtime values
String docker = "quay.io/humancellatlas/secondary-analysis-dropseqtools:v0.2.2-1.12"
String docker = "quay.io/humancellatlas/secondary-analysis-dropseqtools:v0.2.2-1.13"
Int machine_mem_mb = 8250
Int cpu = 1
Int disk = ceil((size(bam_input, "Gi") + size(annotations_gtf, "Gi")) * 3)
Int disk = ceil((size(bam_input, "Gi") + size(annotations_gtf, "Gi")) * 3) + 20
Int preemptible = 3

meta {
Expand All @@ -26,12 +26,36 @@ task TagGeneExon {
command {
set -e

if file --mime-type "${annotations_gtf}" | grep gzip; then
gunzip -c "${annotations_gtf}" > input.gtf
else
mv "${annotations_gtf}" input.gtf
fi

python -u <<CODE
import re

# Rewrite the GTF so that every gene_name attribute carries the gene_id value
# found on the same line. Reads input.gtf and writes gene_id_as_gene_name.gtf,
# both relative to the current working directory.
in_gtf = "input.gtf"
out_gtf = "gene_id_as_gene_name.gtf"

# Compile once instead of re-parsing the patterns for every GTF line.
gene_id_re = re.compile(r'gene_id ([^;]*);')
gene_name_re = re.compile(r'gene_name ([^;]*);')

with open(in_gtf, 'r') as fpin, open(out_gtf, 'w') as fout:
    for _line in fpin:
        line = _line.strip()
        gene_id = gene_id_re.search(line)
        if gene_id:
            replacement = 'gene_name ' + gene_id.group(1) + ';'
            # A callable replacement is inserted verbatim, so a backslash in
            # the gene_id value is never treated as a regex escape or group
            # reference. Lines without a gene_name attribute pass through
            # unchanged because sub() finds nothing to replace.
            line = gene_name_re.sub(lambda _match: replacement, line)
        fout.write(line + '\n')
CODE
TagReadWithGeneExon \
INPUT=${bam_input} \
OUTPUT=bam_with_gene_exon.bam \
SUMMARY=gene_exon_tag_summary.log \
TAG=GE \
ANNOTATIONS_FILE=${annotations_gtf}
ANNOTATIONS_FILE=gene_id_as_gene_name.gtf
}
# Larger genomes (mouse-human) require a 7.5gb instance; single-organism genomes work with 3.75gb
Expand Down

0 comments on commit 5e5e2b8

Please sign in to comment.