#!/usr/bin/bash
#SBATCH --export=NONE
#SBATCH --mem=16G
#SBATCH --cpus-per-task 4
#SBATCH --time=48:00:00
# Generate SoGE/SGE-style .o and .e files
#SBATCH -o %x.o%A.%a
#SBATCH -e %x.e%A.%a
# Partition or queue to use
#SBATCH -p compute
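# Note: no #SBATCH --array directive is set here, so the array range (one task
# per line of the barcode mapping file) is given at submission time.
# Example submission (illustrative only - adjust the range, paths and run name
# to your own data):
#   sbatch --array=1-24 artic.sh barcode_mapping.txt sequencing_summary.txt \
#     /path/to/run_dir /path/to/output_dir run01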
# Needed for conda on our set-up
source ~/.bashrc
# Prevent Qt plotting issues with unattached X11 display
# No longer needed with artic v.1.2.0+ (uncomment for older versions of artic)
#DISPLAY=""
TIME=$(date)
HOST=$(hostname)
START_T=$(date +%s)
echo "Matt Bashton 2020"
echo "SLURM Bash script to run the Artic Network nCoV2 pipeline on ONT data"
echo ""
set -e
set -u
set -o pipefail
# Get our input arguments
BARCODE_MAPPING="${1}"
SEQ_SUM="${2}"
INPUT_DIR="${3}"
OUTPUT_DIR="${4}"
RUN_NAME="${5}"
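# The mapping file is expected to hold one whitespace-separated
# "barcode sample-ID" pair per line, one line per array task,
# e.g. (hypothetical):
#   barcode01 SAMPLE_A01
#   barcode02 SAMPLE_A02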
# Parse barcode mapping
LINE=$(awk "NR==${SLURM_ARRAY_TASK_ID}" ${BARCODE_MAPPING})
set ${LINE}
BARCODE="${1}"
SAMPLE="${2}"
# Some defaults for ONT data
# See https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html
BASE_DIR=${PWD}
CONDA_ENV="artic-ncov2019"
CPU="4"
MIN_LEN="400"
MAX_LEN="700"
NORMALISE="200"
# This will be down to where you have artic-ncov2019 conda env installed
SCHEME_DIR="${HOME}/artic-ncov2019/primer_schemes"
# Change this line to use V3 or V4 primers
PRIMERS="nCoV-2019/V3"
AMPLICON_BED="${SCHEME_DIR}/${PRIMERS}/nCoV-2019.insert.bed"
ANNOVAR="${HOME}/annovar/table_annovar.pl"
ANNOVAR_DB="${HOME}/annovar/sarscov2db"
# Make tmpdir for run
# The /scratch /tmp /local prefix here might be different on your system
TMPDIR=$(mktemp -d -p /local)
# List variables
echo "** Variables **"
echo " - Host: ${HOST}"
echo " - Current working directory: ${PWD}"
echo " - Time: ${TIME}"
echo " - User home dir: ${HOME}"
echo " - Job name: ${SLURM_JOB_NAME}"
echo " - Array Job ID: ${SLURM_ARRAY_JOB_ID}"
echo " - Array Task ID: ${SLURM_ARRAY_TASK_ID}"
echo " - Tmp dir: ${TMPDIR}"
echo " - Global run name: ${RUN_NAME}"
echo " - Barecode - sample ID mapping file: ${BARCODE_MAPPING}"
echo " - Sequencing summary file: ${SEQ_SUM}"
echo " - Input run dir: ${INPUT_DIR}"
echo " - Ouput dir: ${OUTPUT_DIR}"
echo " - Barcodes for this run: ${BARCODE}"
echo " - Sample ID for output: ${SAMPLE}"
echo " - Threads to run artic minion: ${CPU}"
echo " - Min length for read filtering: ${MIN_LEN}"
echo " - Max length for read filtering: ${MAX_LEN}"
echo " - Artic minion normalisation setting: ${NORMALISE}"
echo " - Primer scheme dir: ${SCHEME_DIR}"
echo " - Primer type/version: ${PRIMERS}"
echo " - Amplicons bed file: ${AMPLICON_BED}"
echo " - Annovar: ${ANNOVAR}"
echo " - Annovar DB: ${ANNOVAR_DB}"
echo ""
# Activate conda environment
echo "Activating Conda environment: ${CONDA_ENV}"
conda activate ${CONDA_ENV}
conda env list
echo -ne "Python interpreter located at: "
which python
python --version
echo ""
# cd to tmp dir; as output is streamed, keep it off Lustre
echo "cd'ing to ${TMPDIR}"
cd ${TMPDIR}
# guppyplex filtering output fastq is written to $TMPDIR, single file per barcode, this script works per barcode
echo "Running guppyplex to filter reads, min length = ${MIN_LEN}, max length = ${MAX_LEN}, "
echo "input is: ${INPUT_DIR}/fastq_pass/${BARCODE} prefixing output fastq with: ${RUN_NAME}"
READS_PRE=$(zcat ${INPUT_DIR}/fastq_pass/${BARCODE}/*.gz | awk '{s++}END{print s/4}')
echo "Total input reads pre guppyplex: ${READS_PRE}"
artic guppyplex --skip-quality-check --min-length ${MIN_LEN} --max-length ${MAX_LEN} --directory ${INPUT_DIR}/fastq_pass/${BARCODE} --prefix ${RUN_NAME}
READS_POST=$(awk '{s++}END{print s/4}' ${TMPDIR}/${RUN_NAME}_${BARCODE}.fastq)
PC_PLEX=$(echo "scale=8; (${READS_POST}/${READS_PRE})*100" | bc | xargs printf "%.2f\n")
echo "Reads post guppyplex: ${READS_POST} (${PC_PLEX}%)"
# artic minion final stage, many output / temp files written to ${TMPDIR}
echo "Running artic minion --normalise ${NORMALISE} on ${CPU} threads, fast5 dir is: ${INPUT_DIR}/fast5_pass"
artic minion --normalise ${NORMALISE} --threads ${CPU} --scheme-directory ${SCHEME_DIR} --read-file ${TMPDIR}/${RUN_NAME}_${BARCODE}.fastq --fast5-directory ${INPUT_DIR}/fast5_pass --sequencing-summary ${SEQ_SUM} ${PRIMERS} ${SAMPLE}
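# Downstream steps rely on ${SAMPLE}.sorted.bam, ${SAMPLE}.primertrimmed.rg.sorted.bam,
# ${SAMPLE}.pass.vcf.gz and ${SAMPLE}.consensus.fasta, all written to ${TMPDIR} by this step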
# Check output at this stage
echo ""
echo "Current output is:"
ls -lh
# Find number of aligned reads in ${SAMPLE}.sorted.bam
echo ""
ALN_READS=$(samtools view -c -F 2308 ${TMPDIR}/${SAMPLE}.sorted.bam)
echo "Aligned reads in ${SAMPLE}.sorted.bam: ${ALN_READS}"
# Annotate with annovar
# Work out No variants called
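# zgrep exits non-zero when nothing matches, so "|| true" stops set -e
# aborting the script when a sample has no variants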
VAR=$(zgrep -cv '^#' ${SAMPLE}.pass.vcf.gz || true)
echo ""
echo "Number of variants called: ${VAR}"
if (( ${VAR} > 0 )); then
echo "Annotating with annovar located at: ${ANNOVAR} using annovar DB ${ANNOVAR_DB}"
# Fix ref for annovar with sed
zcat ${SAMPLE}.pass.vcf.gz | sed 's/MN908947.3/NC_045512v2/' > ${SAMPLE}.av.pass.vcf
${ANNOVAR} -buildver NC_045512v2 ${SAMPLE}.av.pass.vcf ${ANNOVAR_DB} -protocol avGene -operation g -vcfinput
echo "Variants called:"
cut -f 1-10 ${SAMPLE}.av.pass.vcf.NC_045512v2_multianno.txt | column -t
elif (( ${VAR} == 0 )); then
echo "No variants called, skipping annovar"
fi
# Final run stats
# Work out number of Ns
TOTAL_BP=$(grep -v ">" ${TMPDIR}/${SAMPLE}.consensus.fasta | tr -d "\n\r" | wc -c)
TOTAL_N=$(grep -v ">" ${TMPDIR}/${SAMPLE}.consensus.fasta | grep -o 'N' | wc -l)
PC_N=$(echo "scale=8; (${TOTAL_N}/${TOTAL_BP})*100" | bc | xargs printf "%.2f\n")
# Per amplicon depth stats
mosdepth -t ${CPU} -b $AMPLICON_BED ${SAMPLE}.inputAmp ${TMPDIR}/${SAMPLE}.sorted.bam
mosdepth -t ${CPU} -b $AMPLICON_BED ${SAMPLE}.outputAmp ${TMPDIR}/${SAMPLE}.primertrimmed.rg.sorted.bam
echo ""
echo "Minimap2 per amplicon mean coverage:"
zcat ${SAMPLE}.inputAmp.regions.bed.gz | awk '{print NR "\t" $5}'
zcat ${SAMPLE}.inputAmp.regions.bed.gz | awk '{print NR "\t" $5}' > ${SAMPLE}.inputAmp.depth.tsv
echo ""
echo "Post nanopolish per amplicon mean coverage:"
zcat ${SAMPLE}.outputAmp.regions.bed.gz | awk '{print NR "\t" $5}'
zcat ${SAMPLE}.outputAmp.regions.bed.gz | awk '{print NR "\t" $5}' > ${SAMPLE}.outputAmp.depth.tsv
# Per amplicon N counts
# Fix header for bedtools
sed s'/>.*/>MN908947.3/' ${TMPDIR}/${SAMPLE}.consensus.fasta > ${TMPDIR}/temp.${SAMPLE}.MN908947.3.fasta
echo ""
echo "Per amplicon N counts:"
bedtools getfasta -fi ${TMPDIR}/temp.${SAMPLE}.MN908947.3.fasta -tab -bed ${AMPLICON_BED} | cut -f 2 | tr -d -c 'N\n' | awk '{ print NR "\t" length; }'
bedtools getfasta -fi ${TMPDIR}/temp.${SAMPLE}.MN908947.3.fasta -tab -bed ${AMPLICON_BED} | cut -f 2 | tr -d -c 'N\n' | awk '{ print NR "\t" length; }' > ${SAMPLE}.AmpNs.tsv
# Global depth stats
mosdepth -t ${CPU} ${SAMPLE}.input ${TMPDIR}/${SAMPLE}.sorted.bam
mosdepth -t ${CPU} ${SAMPLE}.output ${TMPDIR}/${SAMPLE}.primertrimmed.rg.sorted.bam
echo ""
echo "Minimap2 mean coverage stats:"
column -t ${SAMPLE}.input.mosdepth.summary.txt | head -n 2
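# Line 2 of the mosdepth summary should be the single SARS-CoV-2 chromosome row;
# column 4 is its mean depth (same for the post-nanopolish summary below)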
IN_DEPTH=$(sed -sn 2p ${SAMPLE}.input.mosdepth.summary.txt | cut -f 4)
echo ""
echo "Post nanopolish mean coverage stats:"
column -t ${SAMPLE}.output.mosdepth.summary.txt | head -n 2
OUT_DEPTH=$(sed -sn 2p ${SAMPLE}.output.mosdepth.summary.txt | cut -f 4)
echo ""
# Copy back and rename output
# Not all of the output is needed, and files to upload will have to be renamed see:
# https://docs.covid19.climb.ac.uk/upload-instructions
# Dir for data to upload
# Needs alignment.bam and consensus.fa
echo "Creating upload dir at ${OUTPUT_DIR}/upload/${RUN_NAME}/${SAMPLE}"
mkdir -p ${OUTPUT_DIR}/upload/${RUN_NAME}/${SAMPLE}
echo "Copying upload data..."
cp ${TMPDIR}/${SAMPLE}.consensus.fasta ${OUTPUT_DIR}/upload/${RUN_NAME}/${SAMPLE}/consensus.fa
cp ${TMPDIR}/${SAMPLE}.sorted.bam ${OUTPUT_DIR}/upload/${RUN_NAME}/${SAMPLE}/alignment.bam
# Dir for all output
echo "Creating output dir at ${OUTPUT_DIR}/processed/${RUN_NAME}"
mkdir -p ${OUTPUT_DIR}/processed/${RUN_NAME}
echo "Copying ${SAMPLE}.* ..."
cp ${TMPDIR}/${SAMPLE}.* ${OUTPUT_DIR}/processed/${RUN_NAME}
# Needed for older artic v.1.1.3, which produced .pngs with a different output
# naming convention; commented out now as these are not produced in v.1.2.0 or higher
#echo "Copying ${SAMPLE}-* ..."
#cp ${TMPDIR}/${SAMPLE}-* ${OUTPUT_DIR}/processed/${RUN_NAME}
echo "Gzipping ${RUN_NAME}_${BARCODE}.fastq ..."
# Needs pigz (conda install pigz into your artic-ncov2019 env),
# or replace with single-threaded gzip, omitting -p ${CPU}
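# e.g. (uncomment if pigz is unavailable):
#gzip ${TMPDIR}/${RUN_NAME}_${BARCODE}.fastq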
pigz -p ${CPU} ${TMPDIR}/${RUN_NAME}_${BARCODE}.fastq
echo "Copying ${RUN_NAME}_${BARCODE}.fastq.gz ..."
cp ${TMPDIR}/${RUN_NAME}_${BARCODE}.fastq.gz ${OUTPUT_DIR}/processed/${RUN_NAME}
# Run pangolin
echo ""
echo "Running Pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages"
echo "Switching conda envs"
echo "Pangolin stdout written to: ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.pangolin.out"
echo "Snakemake spam written to: ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.pangolin.err"
echo ""
conda deactivate
conda activate pangolin
pangolin -t ${CPU} -o ${TMPDIR}/${SAMPLE}.pangolin --tempdir ${TMPDIR} ${TMPDIR}/${SAMPLE}.consensus.fasta > ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.pangolin.out 2>${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.pangolin.err
cp ${TMPDIR}/${SAMPLE}.pangolin/lineage_report.csv ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.pangolin.lineage_report.csv
echo "Lineage report:"
column -t -s ',' ${TMPDIR}/${SAMPLE}.pangolin/lineage_report.csv
echo ""
# Clean-up temp dir
echo "Removing temp dir: ${TMPDIR}"
cd ${BASE_DIR}
rm -rf ${TMPDIR}
echo ""
echo ""
echo "Final stats"
echo "==========="
echo ""
echo "Reads pre guppyplex: ${READS_PRE}"
echo "Reads post guppyplex: ${READS_POST} (${PC_PLEX}%)"
echo "Aligned reads in sorted.bam: ${ALN_READS}"
echo "Number of Ns consensus: ${TOTAL_N} (${PC_N}%)"
echo "Mean depth of input alignment: ${IN_DEPTH}"
echo "Mean depth of final alignment: ${OUT_DEPTH}"
echo "Number of variants called: ${VAR}"
echo ""
# Write these stats to a file for each sample in output dir
echo "Reads pre guppyplex: ${READS_PRE}" > ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Reads post guppyplex: ${READS_POST}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Reads post guppyples %: ${PC_PLEX}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Aligned reads in sorted.bam: ${ALN_READS}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Number of Ns consensus: ${TOTAL_N}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Number of Ns consensus %: ${PC_N}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Mean depth of input alignment: ${IN_DEPTH}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Mean depth of final alignment: ${OUT_DEPTH}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
echo "Number of variants called: ${VAR}" >> ${OUTPUT_DIR}/processed/${RUN_NAME}/${SAMPLE}.stats.txt
date
END_T=$(date +%s)
RUN_T=$((${END_T}-${START_T}))
echo -ne "Run time was: "
printf '%dh:%dm:%ds\n' $((${RUN_T}/3600)) $((${RUN_T}%3600/60)) $(($RUN_T%60))
echo ""
echo "END"