# AlignAndCall.wdl
# Mitochondria pipeline: aligns an unmapped BAM to the standard and shifted chrM
# references, calls SNPs/indels with Mutect2 on each, and lifts the shifted calls
# back to standard coordinates to produce one combined VCF.
import "https://github.com/gevro/Mitochondria-Pipeline/raw/master/AlignAndMarkDuplicates.wdl" as AlignAndMarkDuplicates
import "https://github.com/gevro/Mitochondria-Pipeline/raw/master/MutectAndFilter.wdl" as MutectAndFilter
workflow AlignAndCall {
meta {
description: "Takes in unmapped bam and outputs VCF of SNP/Indel calls on the mitochondria."
}
parameter_meta {
unmapped_bam: "Unmapped and subset bam, optionally with original alignment (OA) tag"
}
File unmapped_bam
# Passed through to MutectAndFilter; presumably median autosomal coverage used to
# estimate mtDNA copy number — TODO confirm semantics in MutectAndFilter.wdl.
Int? autosomal_coverage
# Standard chrM reference bundle: sequence dictionary, fasta (+ .fai index),
# and the five BWA index files (.amb/.ann/.bwt/.pac/.sa).
File mt_dict
File mt_fasta
File mt_fasta_index
File mt_amb
File mt_ann
File mt_bwt
File mt_pac
File mt_sa
# Sites excluded from variant calling on the standard reference (+ index).
File blacklisted_sites
File blacklisted_sites_index
#Shifted reference is used for calling the control region (edge of mitochondria reference).
#This solves the problem that BWA doesn't support alignment to circular contigs.
File mt_shifted_dict
File mt_shifted_fasta
File mt_shifted_fasta_index
File mt_shifted_amb
File mt_shifted_ann
File mt_shifted_bwt
File mt_shifted_pac
File mt_shifted_sa
# Blacklist in shifted-reference coordinates (+ index).
File blacklisted_sites_shifted
File blacklisted_sites_shifted_index
# Chain file for lifting shifted-reference calls back to standard chrM coordinates.
File shift_back_chain
# Presumably a GATK jar that overrides the version baked into the docker image —
# confirm how MutectAndFilter.wdl consumes it.
File? gatk_override
# Extra arguments appended to the Mutect2 command line (each call below also
# appends its own -L interval to this string).
String? m2_extra_args
# Using an older version of the default Mutect LOD cutoff. This value can be changed and is only useful at low depths
# to catch sites that would not get caught by the LOD divided by depth filter.
Float lod_cutoff = 6.3
# Read length used for optimization only. If this is too small CollectWgsMetrics might fail, but the results are not
# affected by this number. Default is 151.
Int? max_read_length
#Optional runtime arguments
Int? preemptible_tries
# Align the unmapped bam to the standard chrM reference and mark duplicates.
call AlignAndMarkDuplicates.AlignmentPipeline as AlignToMt {
input:
input_bam = unmapped_bam,
mt_dict = mt_dict,
mt_fasta = mt_fasta,
mt_fasta_index = mt_fasta_index,
mt_amb = mt_amb,
mt_ann = mt_ann,
mt_bwt = mt_bwt,
mt_pac = mt_pac,
mt_sa = mt_sa,
preemptible_tries = preemptible_tries
}
# Same alignment, but against the shifted reference (for the control region).
call AlignAndMarkDuplicates.AlignmentPipeline as AlignToShiftedMt {
input:
input_bam = unmapped_bam,
mt_dict = mt_shifted_dict,
mt_fasta = mt_shifted_fasta,
mt_fasta_index = mt_shifted_fasta_index,
mt_amb = mt_shifted_amb,
mt_ann = mt_shifted_ann,
mt_bwt = mt_shifted_bwt,
mt_pac = mt_shifted_pac,
mt_sa = mt_shifted_sa,
preemptible_tries = preemptible_tries
}
# Coverage metrics (and mean coverage) on the standard-reference alignment.
call CollectWgsMetrics {
input:
input_bam = AlignToMt.mt_aligned_bam,
input_bam_index = AlignToMt.mt_aligned_bai,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
read_length = max_read_length,
coverage_cap = 100000,
preemptible_tries = preemptible_tries
}
# Haplochecker-based contamination estimate; minor_level feeds into filtering.
call GetContamination {
input:
input_bam = AlignToMt.mt_aligned_bam,
input_bam_index = AlignToMt.mt_aligned_bai,
}
call MutectAndFilter.MitochondriaCalling as CallAndFilterMt {
input:
input_bam = AlignToMt.mt_aligned_bam,
input_bam_index = AlignToMt.mt_aligned_bai,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_dict = mt_dict,
lod_cutoff = lod_cutoff,
gatk_override = gatk_override,
# Everything is called except the control region.
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:576-16024 ",
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
preemptible_tries = preemptible_tries
}
call MutectAndFilter.MitochondriaCalling as CallAndFilterShiftedMt {
input:
input_bam = AlignToShiftedMt.mt_aligned_bam,
input_bam_index = AlignToShiftedMt.mt_aligned_bai,
ref_fasta = mt_shifted_fasta,
ref_fasta_index = mt_shifted_fasta_index,
ref_dict = mt_shifted_dict,
lod_cutoff = lod_cutoff,
gatk_override = gatk_override,
# Interval correspondes to control region in the shifted reference
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:8025-9144 ",
blacklisted_sites = blacklisted_sites_shifted,
blacklisted_sites_index = blacklisted_sites_shifted_index,
mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
preemptible_tries = preemptible_tries
}
# Lift the shifted-reference (control region) calls back and merge with the rest.
call LiftoverAndCombineVcfs {
input:
shifted_vcf = CallAndFilterShiftedMt.vcf,
vcf = CallAndFilterMt.vcf,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_dict = mt_dict,
shift_back_chain = shift_back_chain,
preemptible_tries = preemptible_tries
}
output {
File mt_aligned_bam = AlignToMt.mt_aligned_bam
File mt_aligned_bai = AlignToMt.mt_aligned_bai
File mt_aligned_shifted_bam = AlignToShiftedMt.mt_aligned_bam
File mt_aligned_shifted_bai = AlignToShiftedMt.mt_aligned_bai
File out_vcf = LiftoverAndCombineVcfs.final_vcf
File out_vcf_index = LiftoverAndCombineVcfs.final_vcf_index
File duplicate_metrics = AlignToMt.duplicate_metrics
File coverage_metrics = CollectWgsMetrics.metrics
File theoretical_sensitivity_metrics = CollectWgsMetrics.theoretical_sensitivity
File contamination_metrics = GetContamination.contamination_file
Int mean_coverage = CollectWgsMetrics.mean_coverage
String major_haplogroup = GetContamination.major_hg
Float contamination = GetContamination.minor_level
}
}
task GetContamination {
  File input_bam
  File input_bam_index
  # Minimum base quality (--baseQ) passed to haplochecker.
  Int qual = 20
  # Minimum mapping quality (--mapQ) passed to haplochecker.
  Int map_qual = 30
  String basename = basename(input_bam)
  # runtime
  # Optional preemptible count for consistency with the other tasks in this file.
  # Default stays 0 (never preempt), preserving the previous hardcoded behavior.
  Int? preemptible_tries
  Int disk_size = ceil(size(input_bam, "GB")) + 20
  meta {
    description: "Uses Haplochecker to estimate levels of contamination in mitochondria"
  }
  parameter_meta {
    input_bam: "Bam aligned to chrM"
  }
  # Runs haplochecker, then a Python 2 snippet that splits the contamination
  # report into one-value files (major/minor haplogroup and heteroplasmy levels)
  # so they can be read by the output block. "n/a" levels are mapped to 1 (major)
  # and 0 (minor).
  # NOTE(review): the loop rewrites these files on every row, so only the last
  # row of contamination.txt is kept — presumably the report is single-row; confirm.
  # NOTE(review): --output haplochecker_out apparently resolves to /haplochecker_out
  # inside this container — confirm against the gevrony/haplocheck image.
  command {
    set -e
    /cloudgene run [email protected] \
      --files ${input_bam} \
      --baseQ ${qual} \
      --mapQ ${map_qual} \
      --output haplochecker_out
    mv /haplochecker_out/contamination/contamination.txt ${basename}.contamination.txt
    python <<CODE
import csv
with open("${basename}.contamination.txt") as output:
    reader = csv.DictReader(output, delimiter='\t')
    for row in reader:
        print >>open("major_hg.txt", 'w'),row["Major Haplogroup"]
        print >>open("minor_hg.txt", 'w'),row["Minor Haplogroup"]
        if row["Major Heteroplasmy Level"] == "n/a":
            print >>open("major_level.txt", 'w'),"1"
        else:
            print >>open("major_level.txt", 'w'),row["Major Heteroplasmy Level"]
        if row["Minor Heteroplasmy Level"] == "n/a":
            print >>open("minor_level.txt", 'w'),"0"
        else:
            print >>open("minor_level.txt", 'w'),row["Minor Heteroplasmy Level"]
CODE
  }
  runtime {
    preemptible: select_first([preemptible_tries, 0])
    memory: "8 GB"
    disks: "local-disk " + disk_size + " HDD"
    docker: "gevrony/haplocheck:latest"
  }
  output {
    File contamination_file = "${basename}.contamination.txt"
    String major_hg = read_string("major_hg.txt")
    Float major_level = read_float("major_level.txt")
    String minor_hg = read_string("minor_hg.txt")
    Float minor_level = read_float("minor_level.txt")
  }
}
task CollectWgsMetrics {
File input_bam
File input_bam_index
File ref_fasta
File ref_fasta_index
# Optional override for READ_LENGTH; defaults to 151 below.
Int? read_length
Int read_length_for_optimization = select_first([read_length, 151])
# Optional hard cap on per-base coverage counted by CollectWgsMetrics
# (omitted from the command line when unset).
Int? coverage_cap
String output_basename = basename("${input_bam}",".bam")
Int? preemptible_tries
# Disk sized from the bam plus the reference, with 20 GB of headroom.
Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB")
Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
meta {
description: "Collect coverage metrics"
}
parameter_meta {
read_length: "Read length used for optimization only. If this is too small CollectWgsMetrics might fail. Default is 151."
}
# Runs Picard CollectWgsMetrics, then an R snippet that reads the metrics table
# (skipping the 6 header/comment lines, first data row only) and writes
# floor(MEAN_COVERAGE) to mean_coverage.txt for the Int output below.
command <<<
set -e
java -Xms2000m -jar /usr/gitc/picard.jar \
CollectWgsMetrics \
INPUT=${input_bam} \
VALIDATION_STRINGENCY=SILENT \
REFERENCE_SEQUENCE=${ref_fasta} \
OUTPUT=${output_basename}.metrics.txt \
USE_FAST_ALGORITHM=true \
READ_LENGTH=${read_length_for_optimization} \
${"COVERAGE_CAP=" + coverage_cap} \
INCLUDE_BQ_HISTOGRAM=true \
THEORETICAL_SENSITIVITY_OUTPUT=${output_basename}.theoretical_sensitivity.txt
R --vanilla <<CODE
df = read.table("${output_basename}.metrics.txt",skip=6,header=TRUE,stringsAsFactors=FALSE,sep='\t',nrows=1)
write.table(floor(df[,"MEAN_COVERAGE"]), "mean_coverage.txt", quote=F, col.names=F, row.names=F)
CODE
>>>
runtime {
preemptible: select_first([preemptible_tries, 5])
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
}
output {
File metrics = "${output_basename}.metrics.txt"
File theoretical_sensitivity = "${output_basename}.theoretical_sensitivity.txt"
Int mean_coverage = read_int("mean_coverage.txt")
}
}
task LiftoverAndCombineVcfs {
File shifted_vcf
File vcf
# Output files are named after the shifted VCF with its .vcf extension stripped.
String basename = basename(shifted_vcf, ".vcf")
File ref_fasta
File ref_fasta_index
File ref_dict
File shift_back_chain
# runtime
Int? preemptible_tries
Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB")
Int disk_size = ceil(size(shifted_vcf, "GB") + ref_size) + 20
meta {
description: "Lifts over shifted vcf of control region and combines it with the rest of the chrM calls."
}
parameter_meta {
shifted_vcf: "VCF of control region on shifted reference"
vcf: "VCF of the rest of chrM on original reference"
ref_fasta: "Original (not shifted) chrM reference"
shift_back_chain: "Chain file to lift over from shifted reference to original chrM"
}
# Step 1: LiftoverVcf maps the shifted-reference calls back to standard chrM
# coordinates (records that fail to lift go to the rejected VCF).
# Step 2: MergeVcfs combines the lifted calls with the non-control-region calls.
# NOTE(review): the final_vcf_index output relies on MergeVcfs writing a .idx
# alongside the merged VCF (CREATE_INDEX defaults to true for MergeVcfs) —
# confirm for the Picard version in this docker image.
command<<<
set -e
java -jar /usr/gitc/picard.jar LiftoverVcf \
I=${shifted_vcf} \
O=${basename}.shifted_back.vcf \
R=${ref_fasta} \
CHAIN=${shift_back_chain} \
REJECT=${basename}.rejected.vcf
java -jar /usr/gitc/picard.jar MergeVcfs \
I=${basename}.shifted_back.vcf \
I=${vcf} \
O=${basename}.final.vcf
>>>
runtime {
disks: "local-disk " + disk_size + " HDD"
memory: "1200 MB"
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
preemptible: select_first([preemptible_tries, 5])
}
output{
# rejected_vcf should always be empty
File rejected_vcf = "${basename}.rejected.vcf"
File final_vcf = "${basename}.final.vcf"
File final_vcf_index = "${basename}.final.vcf.idx"
}
}