-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAssignment1_script
312 lines (275 loc) · 10.3 KB
/
Assignment1_script
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
#!/bin/bash
cd
#Create Assignment 1 directory in home directory
echo "Creating Assignment 1 folder"
mkdir Assignment1; cd ~/Assignment1
#Copy AY21 directory into own directory
echo "Copying assignment data into Assignment 1 folder"
cp -r /localdisk/data/BPSM/AY21 .
#Unzip files
echo "Unzipping fastq files"
cd ~/Assignment1/AY21/fastq; for gzip in *.gz; do gzip -d $gzip; done
#QUESTION 1
#Run fastqc on all .fq files
cd ~/Assignment1/AY21/fastq
echo "Running fastqc on .fq files"
fastqc *.fq
#Create directory for fastqc results
echo "Creating directory for fastqc results and moving the fastqc results to that folder"
mkdir fastqc_results
#Move fastqc results into new directory
grep -lir 'fastqc' * |xargs mv -t fastqc_results
#Unzip fastqc files
cd
cd ~/Assignment1/AY21/fastq/fastqc_results; for filename in *.zip; do unzip $filename; done
rm -f *.zip
#QUESTION 2
#Concatenate summary files into one big one
echo "Concatenating fastqc summaries into one file"
cd ~/Assignment1/AY21/fastq/fastqc_results
cat */summary.txt > fastqc_summaries.txt
#Summary of FastQC
echo "Number of raw sequences that passed one of the variables from fastqc:"
grep -c PASS fastqc_summaries.txt
#777
echo "Number of raw sequences that had WARN for one of the variables from fastqc:"
grep -c WARN fastqc_summaries.txt
#86
echo "Number of raw sequences that failed one of the variables from fastqc:"
grep -c FAIL fastqc_summaries.txt
#37
#Per sequence quality score
awk 'NR % 10 == 2' fastqc_summaries.txt > sequence_quality.txt
echo "Number of raw sequences that passed the sequence quality test:"
grep -c PASS sequence_quality.txt
#QUESTION 3
#Move to Tcongo_genome directory
cd
cd ~/Assignment1/AY21/Tcongo_genome; for gzip in *.gz; do gzip -d $gzip; done
#Make database for bowtie2
echo "Building Tcongolense_genome_database from its fasta file"
bowtie2-build TriTrypDB-46_TcongolenseIL3000_2019_Genome.fasta Tcongolense_genome_database
#Read pairs alignment to T. congolense genome using bowtie2
cd
fqdir=Assignment1/AY21/fastq
for x1 in "$fqdir"/*_1.fq; do
x2=${x1%_1.fq}_2.fq
if [[ -f $x2 ]] ; then
echo "Aligning $x1 and $x2 to Tcongolense_genome_database"
bowtie2 --threads 60 -x ~/Assignment1/AY21/Tcongo_genome/Tcongolense_genome_database -1 "$x1" -2 "$x2" -S "${x1%.*}"_output.sam
else
echo "$x2 not found" >&2
fi
done
cd; mkdir Assignment1/AY21/genome_alignment
mv Assignment1/AY21/fastq/*.sam Assignment1/AY21/genome_alignment
cd; cd Assignment1/AY21/genome_alignment
#Convert output to indexed "bam" format with samtools
for file in $(ls *.sam);
do
# ${file%.*} means that it will take the file name and remove everything
# after the last punctuation in the name.
echo "Converting $file to ${file%.*}.bam"
samtools view -bS $file > ${file%.*}.bam
done
#QUESTION 4
cd
cd Assignment1/AY21
mv TriTrypDB-46_TcongolenseIL3000_2019.bed genome_alignment
cd genome_alignment
for file in $(ls *.bam);
do
echo "Generating counts data for $file"
bedtools coverage -a TriTrypDB-46_TcongolenseIL3000_2019.bed -b $file -counts > ${file%.*}.bed
done
# -a is the bed file
# -b is the reads alignment bam file
mkdir sam_files; mv *.sam sam_files
mkdir bam_files; mv *.bam bam_files
mkdir bed_files; mv *.bed bed_files
#QUESTION 5
cd
cd Assignment1/AY21/genome_alignment/bed_files
#Group 1 - WT: t=0, Uninduced
echo "Generating text file with mean of the counts per gene for WT, t=0, uninduced: WT_t0_U"
cut -f4,5 100k.WT-1-507_1_output.bed > WT_t0_U.txt
for f in 100k.WT-1-507_1_output.bed 100k.WT-2-511_1_output.bed 100k.WT-3-512_1_output.bed
do
cut -f6 "$f" | paste WT_t0_U.txt - > tmp.txt
mv {tmp,WT_t0_U}.txt
done
#Group 2 - WT: t=24, Uninduced
echo "Generating text file with mean of the counts per gene for WT, t=24, Uninduced: WT_t24_U"
cut -f4,5 100k.WT-4-530_1_output.bed > WT_t24_U.txt
for f in 100k.WT-4-530_1_output.bed 100k.WT-5-531_1_output.bed 100k.WT-6-532_1_output.bed
do
cut -f6 "$f" | paste WT_t24_U.txt - > tmp.txt
mv {tmp,WT_t24_U}.txt
done
#Group 3 - WT: t=24, Induced
echo "Generating text file with mean of the counts per gene for WT, t=24, Induced: WT_t24_I"
cut -f4,5 100k.WT-1-525_1_output.bed > WT_t24_I.txt
for f in 100k.WT-1-525_1_output.bed 100k.WT-2-526_1_output.bed 100k.WT-3-529_1_output.bed
do
cut -f6 "$f" | paste WT_t24_I.txt - > tmp.txt
mv {tmp,WT_t24_I}.txt
done
#Group 4 - WT: t=48, Uninduced
echo "Generating text file with mean of the counts per gene for WT, t=48, Uninduced: WT_t48_U"
cut -f4,5 100k.WT-4-553_1_output.bed > WT_t48_U.txt
for f in 100k.WT-4-553_1_output.bed 100k.WT-5-554_1_output.bed 100k.WT-6-555_1_output.bed
do
cut -f6 "$f" | paste WT_t48_U.txt - > tmp.txt
mv {tmp,WT_t48_U}.txt
done
#Group 5 - WT: t=48, Induced
echo "Generating text file with mean of the counts per gene for WT, t=48, Induced: WT_t48_I"
cut -f4,5 100k.WT-1-550_1_output.bed > WT_t48_I.txt
for f in 100k.WT-1-550_1_output.bed 100k.WT-2-551_1_output.bed 100k.WT-3-552_1_output.bed
do
cut -f6 "$f" | paste WT_t48_I.txt - > tmp.txt
mv {tmp,WT_t48_I}.txt
done
#Group 6 - Clone 1: t=0, Uninduced
echo "Generating text file with mean of the counts per gene for Clone 1, t=0, Uninduced: C1_t0_U"
cut -f4,5 100k.C1-1-501_1_output.bed > C1_t0_U.txt
for f in 100k.C1-1-501_1_output.bed 100k.C1-2-502_1_output.bed 100k.C1-3-503_1_output.bed
do
cut -f6 "$f" | paste C1_t0_U.txt - > tmp.txt
mv {tmp,C1_t0_U}.txt
done
#Group 7 - Clone 1: t=24, Uninduced
echo "Generating text file with mean of the counts per gene for Clone 1, t=24, Uninduced: C1_t24_U"
cut -f4,5 100k.C1-4-516_1_output.bed > C1_t24_U.txt
for f in 100k.C1-4-516_1_output.bed 100k.C1-5-517_1_output.bed 100k.C1-6-518_1_output.bed
do
cut -f6 "$f" | paste C1_t24_U.txt - > tmp.txt
mv {tmp,C1_t24_U}.txt
done
#Group 8 - Clone 1: t=24, Induced
echo "Generating text file with mean of the counts per gene for Clone 1, t=24, Induced: C1_t24_I"
cut -f4,5 100k.C1-1-513_1_output.bed > C1_t24_I.txt
for f in 100k.C1-1-513_1_output.bed 100k.C1-2-514_1_output.bed 100k.C1-3-515_1_output.bed
do
cut -f6 "$f" | paste C1_t24_I.txt - > tmp.txt
mv {tmp,C1_t24_I}.txt
done
#Group 9 - Clone 1: t=48, Uninduced
echo "Generating text file with mean of the counts per gene for Clone 1, t=48, Uninduced: C1_t48_U"
cut -f4,5 100k.C1-4-536_1_output.bed > C1_t48_U.txt
for f in 100k.C1-4-536_1_output.bed 100k.C1-5-539_1_output.bed 100k.C1-6-541_1_output.bed
do
cut -f6 "$f" | paste C1_t48_U.txt - > tmp.txt
mv {tmp,C1_t48_U}.txt
done
#Group 10 - Clone 1: t=48, Induced
echo "Generating text file with mean of the counts per gene for Clone 1, t=48, Uninduced: C1_t48_I"
cut -f4,5 100k.C1-1-533_1_output.bed > C1_t48_I.txt
for f in 100k.C1-1-533_1_output.bed 100k.C1-2-534_1_output.bed 100k.C1-3-535_1_output.bed
do
cut -f6 "$f" | paste C1_t48_I.txt - > tmp.txt
mv {tmp,C1_t48_I}.txt
done
#Group 11 - Clone 2: t=0, Uninduced
echo "Generating text file with mean of the counts per gene for Clone 2, t=0, Uninduced: C2_t0_U"
cut -f4,5 100k.C2-1-504_1_output.bed > C2_t0_U.txt
for f in 100k.C2-1-504_1_output.bed 100k.C2-2-505_1_output.bed 100k.C2-3-506_1_output.bed
do
cut -f6 "$f" | paste C2_t0_U.txt - > tmp.txt
mv {tmp,C2_t0_U}.txt
done
#Group 12 - Clone 2: t=24, Uninduced
echo "Generating text file with mean of the counts per gene for Clone 2, t=24, Uninduced: C2_t24_U"
cut -f4,5 100k.C2-4-522_1_output.bed > C2_t24_U.txt
for f in 100k.C2-4-522_1_output.bed 100k.C2-5-523_1_output.bed 100k.C2-6-524_1_output.bed
do
cut -f6 "$f" | paste C2_t24_U.txt - > tmp.txt
mv {tmp,C2_t24_U}.txt
done
#Group 13 - Clone 2: t=24, Induced
echo "Generating text file with mean of the counts per gene for Clone 2, t=24, Induced: C2_t24_I"
cut -f4,5 100k.C2-1-519_1_output.bed > C2_t24_I.txt
for f in 100k.C2-1-519_1_output.bed 100k.C2-2-520_1_output.bed 100k.C2-3-521_1_output.bed
do
cut -f6 "$f" | paste C2_t24_I.txt - > tmp.txt
mv {tmp,C2_t24_I}.txt
done
#Group 14 - Clone 2: t=48, Uninduced
echo "Generating text file with mean of the counts per gene for Clone 2, t=48, Uninduced: C2_t48_U"
cut -f4,5 100k.C2-4-545_1_output.bed > C2_t48_U.txt
for f in 100k.C2-4-545_1_output.bed 100k.C2-5-546_1_output.bed 100k.C2-6-548_1_output.bed
do
cut -f6 "$f" | paste C2_t48_U.txt - > tmp.txt
mv {tmp,C2_t48_U}.txt
done
#Group 15 - Clone 2: t=48, Induced
echo "Generating text file with mean of the counts per gene for Clone 2, t=48, Induced: C2_t48_I"
cut -f4,5 100k.C2-1-542_1_output.bed > C2_t48_I.txt
for f in 100k.C2-1-542_1_output.bed 100k.C2-2-543_1_output.bed 100k.C2-3-544_1_output.bed
do
cut -f6 "$f" | paste C2_t48_I.txt - > tmp.txt
mv {tmp,C2_t48_I}.txt
done
for file in *_U.txt
do
awk '{sum = 0; for (i = 0; i <= NF; i++) sum += $i; sum /= 3; print sum}' $file > $file_temp.txt
cut -f1 $f_temp.txt | paste $file - > tmp.txt
mv {tmp,${file%.*}}.txt
rm -f $file_temp.txt
done
for file in *_I.txt
do
awk '{sum = 0; for (i = 0; i <= NF; i++) sum += $i; sum /= 3; print sum}' $file > $file_temp.txt
cut -f1 $f_temp.txt | paste $file - > tmp.txt
mv {tmp,${file%.*}}.txt
rm -f $file_temp.txt
done
#removing counts data of each replicate to only be left with mean average
for file in *.txt
do
cut -f1,2, < $file > $file.new
mv $file.new $file
done
cd
cd Assignment1/AY21
mkdir expression_level
cd
mv Assignment1/AY21/genome_alignment/bed_files/*.txt Assignment1/AY21/expression_level
#QUESTION 6
cd
cd Assignment1/AY21
mkdir fold_change
cd
cd Assignment1/AY21/expression_level
for i in *.txt
do
for j in *.txt
do
if [ "$i" \< "$j" ]
then
echo "Pairing groups ${i%.*} and ${j%.*} to generate fold change data"
cut -f1,2 C2_t24_I.txt > ${i%.*}.vs.${j%.*}.txt
cut -f6 $i | paste ${i%.*}.vs.${j%.*}.txt - > tmp.txt
mv {tmp,${i%.*}.vs.${j%.*}}.txt
cut -f6 $j | paste ${i%.*}.vs.${j%.*}.txt - > tmp.txt
mv {tmp,${i%.*}.vs.${j%.*}}.txt
fi
done
done
cd
find Assignment1/AY21/expression_level/*.txt -name "*.vs.*" -exec mv -t Assignment1/AY21/fold_change {} +
cd Assignment1/AY21/fold_change
for file in *.txt
do
echo "Generating fold change data for $file"
awk '{sum = 0; for (i = 0; i <= NF; i++) sum += $i; sum /= 2; print sum}' $file > $file_temp.txt
cut -f1 $f_temp.txt | paste $file - > tmp.txt
mv {tmp,${file%.*}}.txt
rm -f $file_temp.txt
done
#removing averages of the two groups to only have fold change
for file in *.txt; do
cut -f1,2,5 < $file > $file.new
mv $file.new $file
done
echo "Project Finished!"