-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgtf2fasta.sh
executable file
·73 lines (63 loc) · 2.17 KB
/
gtf2fasta.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/bash
#PBS -l nodes=1:ppn=4
GENOME="mm9"
#### usage ####
usage() {
echo
echo Program: "gtf2fasta.sh (retrieve nucleotide sequences corresponding to transcripts in gtf format)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: [email protected]
echo "Usage: gtf2fasta.sh -i <file> [OPTIONS]"
echo "Options:"
echo " -i <file> [input GTF file]"
echo " -o <file> [output fasta file]"
echo "[OPTIONS]"
echo " -g <genome> [reference genome (default: mm9)]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:o:g:h ARG; do
case "$ARG" in
i) GTFFILE=$OPTARG;;
o) FASTAFILE=$OPTARG;;
g) GENOME=$OPTARG;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ ! -f "$GTFFILE" -o -z "$FASTAFILE" -o "$HELP" ]; then
usage
fi
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
echo -n "Populating files based on input genome ($GENOME)... "
if [ "$GENOME" == "mm9" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.fa"
elif [ "$GENOME" == "hg19" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.fa"
else
echo "Presently the program only support analysis for mm9 or hg19"
echo
usage
fi
echo done
echo -n "format GTF file... "
ID=$(echo $GTFFILE | sed 's/\.GTF/_mod/ig');
grep -v "track" $GTFFILE | perl -an -F'/\t/' -e '@t=split(/\;/, $F[8]); $line=(); foreach(@t) { @u=split(/\s+/,$_); $des=(); foreach(@u[0..scalar(@u)-2]) { if($_=~/^$/) { next; } $des.=$_."_"; } $des=~s/\_$//g; $line.="$des \"$u[scalar(@u)-1]\"\; "; } print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t$F[5]\t$F[6]\t$F[7]\t$line\n";' | perl -an -F'/\t/' -e '@t=split(/\;/,$F[8]); $id="NA"; foreach(@t) { if($_=~/isoform/i || $_=~/transcript/i) { $_=~s/^.*\s+//g; $_=~s/\"//g; $id=$_;} } print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t$F[5]\t$F[6]\t$F[7]\t$id\n";' > $ID.GTF
echo "done"
echo -n "retrieve transcript sequence in fasta format... "
gffread -w $FASTAFILE -g $GENOME_FILE $ID.GTF
rm $ID.GTF
echo "done"