-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathiedb_csv2fasta
executable file
·65 lines (58 loc) · 4.14 KB
/
iedb_csv2fasta
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env perl
use strict;
use warnings;
# The exported data from IEDB.org will look something like this:
# Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Epitope,Related Object,Related Object,Related Object,Related Object,Related Object,Related Object,Related Object,Related Object,Related Object,Related Object,Related Object
# Epitope ID,Object Type,Description,Epitope Modified Residue(s),Epitope Modification(s),Starting Position,Ending Position,Non-peptidic epitope Accession,Epitope Synonyms,Antigen Name,Antigen Accession,Parent Protein,Parent Protein Accession,Organism Name,Parent Organism,Parent Organism ID,Epitope Comments,Epitope Relationship,Object Type,Description,Starting Position,Ending Position,Non-peptidic object Accession,Synonyms,Antigen Name,Parent Protein,Organism Name,Parent Organism
# "6668","Linear peptide","CMTSCCSCLK","","","1236","1245","","","surface glycoprotein [Severe acute respiratory syndrome coronavirus 2]","YP_009724390.1","sp|P59594|SPIKE_CVHSA Spike glycoprotein OS=Human SARS coronavirus OX=694009 GN=S PE=1 SV=1","P59594","Severe acute respiratory syndrome coronavirus 2","Severe acute respiratory syndrome-related coronavirus","694009","The epitope region is deduced from a reactive overlapping peptide pool.","","","","","","","","","","",""
# "997006","Discontinuous peptide","Y369, N370, S371, A372, F374, F377, K378, C379, Y380, G381, V382, S383, P384, T385, K386, L390, F429, T430, F515, E516, L517","","","","","","","surface glycoprotein [Severe acute respiratory syndrome coronavirus 2]","QHD43416.1","sp|P59594|SPIKE_CVHSA Spike glycoprotein OS=Human SARS coronavirus OX=694009 GN=S PE=1 SV=1","P59594","Severe acute respiratory syndrome coronavirus 2","Severe acute respiratory syndrome-related coronavirus","694009","The epitope residues were calculated from [PDB: 6W41] as the antigen residues at 4Å atomic distance from the antibody.","","","","","","","","","","",""
@ARGV == 3 or die "Usage: $0 <IEDB epitope search result download input.csv> <linear epitope output.fasta> <discontinuous epitope output.fasta>\n";
my $csv = shift @ARGV;
my $linear_fasta = shift @ARGV;
my $discontinuous_fasta = shift @ARGV;
open(CSV, $csv)
or die "Cannot open $csv for reading: $!\n";
<CSV>;
<CSV>; # two header lines
open(LINEAR_FASTA, ">$linear_fasta")
or die "Cannot open $linear_fasta for writing: $!\n";
open(DISCONTINUOUS_FASTA, ">$discontinuous_fasta")
or die "Cannot open $discontinuous_fasta for writing: $!\n";
while(<CSV>){
s/^"|"$//g;
my @F = split /","/, $_;
if($F[1] eq "Linear peptide"){
# IEDB epitopes can have modifications, let's remove them (e.g. TEIYQAGSTPCNG + DEAM(N12) should just be TEIYQAGSTPCNG for our purposes)
$F[2] =~ s/\s*\+\s*\S+\(.*?\)//g;
print LINEAR_FASTA ">$F[0]\n$F[2]\n";
}
elsif($F[1] eq "Discontinuous peptide"){
my @epitope_residues = split /,/, $F[2];
next unless $#epitope_residues;
# Occasionally the residues are not in positional order, but we need them to be for the math below
@epitope_residues = sort {my($an) = $a =~ /(\d+)/; my($bn) = $b =~ /(\d+)/; $an <=> $bn} @epitope_residues;
my $first_pos = $epitope_residues[0];
my $last_pos = $epitope_residues[$#epitope_residues];
if($first_pos =~ s/^\s*[A-Z]// and $last_pos =~ s/^\s*[A-Z]//){
my @epitope = ("X") x ($last_pos - $first_pos + 1);
for my $residue_spec (@epitope_residues){
my ($residue, $pos) = $residue_spec =~ /^\s*([A-Z])(\d+)\s*$/;
if(not defined $residue or not defined $pos){
die "Cannot parse residue information, expected [A-Z]\\d+ but found $residue_spec ($csv line #$.)\n";
}
$epitope[$pos-$first_pos] = $residue
}
# Only keep the epitopes that have enough contiguous non-X AA residues to stand a fighting chance of alignment in TBLASTN downstream.
my $epitope = join("", @epitope);
next unless $epitope =~ /[^X]{2}/;
print DISCONTINUOUS_FASTA ">$F[0]\n$epitope\n";
}
else{
die "Cannot parse residue information, expected [A-Z]\\d+ but found $first_pos and $last_pos ($csv line #$.)\n";
}
}
# else: Ignore non-peptide antigens
}
close(CSV);
close(LINEAR_FASTA);
close(DISCONTINUOUS_FASTA);