-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmulti_bbh.pl
executable file
·224 lines (189 loc) · 7.31 KB
/
multi_bbh.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/usr/bin/perl
use strict;
use Getopt::Std;
use Bio::FeatureIO;
use lib $ENV{ORTHO};
use OrthologAnalysis;
my $usage = "
# This program looks for bi-directional best hits between organism pairs.
# Input is blastp output from an all_v_all search between multiple genomes.
# The top non-self hit for each gene to each genome is stored in a hash, then we go
# through each protein from one of the genomes and check for reciprocal
# best hits. Cutoffs are >30% id over > 70% of the query length
# for the first iteration. (The length cutoff only has to be unidirectional
# to accommodate fusion proteins.) The avg and stdev for %id (AAI) are
# calculated from the initial results. In the second iteration, the cutoff
# is set to avg-2*stdev, constraining the 'best' hits to reasonable scores.
# Clear?
USAGE: multi_bbh.pl -i inputfile -p results_dir [-e fasta_file_ext]
-i - blast_tab.pl output from an all vs all blastp search.
-p - file path to directory containing input files and all_v_all results
-e - file extension used for input fasta files (default 'faa')
";
my %arg;
&getopts('i:p:e:h', \%arg);
if ($arg{'h'}) { die $usage }
my $file = $arg{'i'} or die "Need to provide input btab file with -i\n";
my $path = $arg{'p'};
my $ext = $arg{'e'} ? $arg{'e'} : 'faa';
my $VERBOSE = 1;
our $length_cutoff = 0.7;
our $id_cutoff = 30;
# still have the problem of assocating proteins with genomes, since it's not explicit in the
# pair-wise output. In this instance, we will use the fasta files, since they were the
# original input for the searches.
my ($PtoDB, $DBtoP) = &parse_protein_fasta_files($path, $ext);
my $HITS = &read_file($file, $PtoDB); # populates %HITS
our %BBH;
my $per_id_ref = &look_for_bbh(keys %$DBtoP); # populates %BBH
my @DB = sort keys %$per_id_ref;
# Roll through the pair-wise comparisons
for (my $i=0; $i<@DB; $i++) {
my $db1 = $DB[$i];
print STDERR "Doing $db1...\n";
for (my $j=$i+1; $j<@DB; $j++) {
my $db2 = $DB[$j];
print STDERR "...with $db2\n";
if (! defined $per_id_ref->{$db1}->{$db2}) { next } # If we don't have scores for these two dbs
# Calculate the AAI for the two genomes so we can set a threshhold.
my $index = scalar(@{$per_id_ref->{$db1}->{$db2}}); # how many bbhs are there?
my $sum = 0;
foreach (@{$per_id_ref->{$db1}->{$db2}}) { $sum += $_ } # sum the AAIDs
my $avg = $sum/$index; # calculate the average
# Calculate the stdev for bbh AAI
my $x = 0;
foreach (@{$per_id_ref->{$db1}->{$db2}}) { $x += ($_ - $avg)**2 }
my $stdev = sqrt($x/$index);
# The threshhold is 2 stdev from the average.
my $keep_threshold = sprintf "%.2f\n",($avg-2*$stdev);
if ($keep_threshold < $id_cutoff) { $keep_threshold = $id_cutoff }
if ($VERBOSE) {
print "$db1 - $db2\n";
print "index = $index\n";
printf "Avg per_id = %.2f\n",$avg;
printf "St dev = %.2f\n",$stdev;
print "Keep threshold = $keep_threshold\n";
}
open OUT, ">$path/${db1}_BBH_${db2}";
# we want to print out because this is the multiparanoid input
# OID bits 'species' 'IPscore' gene boostrap
# 1 1302 dros_sh_dfl_12.pep 1.000 S0_Dshi_0002 100%
# 1 1302 bin05.pep 1.000 bin05_02438 100%
# some of it we'll make up (like IPscore and bootstrap)
my $index = 0;
foreach my $p (sort {$a cmp $b} keys %BBH) {
if ($PtoDB->{$p} ne $db1) { next }
if (defined $BBH{$p}->{$db2}
&& $HITS->{$p}->{$db2}->{'per_id'} >= $keep_threshold
) {
$index++;
# multiParanoid input
print OUT "$index\t$HITS->{$p}->{$db2}->{bits}\t$db1\t$HITS->{$p}->{$db2}->{score}\t$p\t100%\n";
print OUT "$index\t$HITS->{$BBH{$p}->{$db2}}->{$db1}->{bits}\t$db2\t$HITS->{$BBH{$p}->{$db2}}->{$db1}->{score}\t$BBH{$p}->{$db2}\t100%\n";
}
}
close OUT;
}
}
exit();
sub look_for_bbh {
my @DB = @_;
my %per_id_ref;
#my %BBH;
# Go through each protein
foreach my $p (keys %$HITS) {
my $db = $PtoDB->{$p};
if (!$db) { die "Where did this protein come from? It's not in any of the parsed fasta files: $p\n";}
# go through each db
foreach my $qdb (@DB) {
# skip if there's no hits
if (! defined $HITS->{$p}->{$qdb}->{'tophit'}) { next }
my $q = $HITS->{$p}->{$qdb}->{'tophit'}; # ie top hit from other genome
# Check for unidirectional hits.
if (! defined $HITS->{$q}->{$db}) { # perhaps because length cutoff wasnt' met for a fusion protein...
# Not sure why this code is here. Do we really want to include
# unidirectional hits?
# print OUT "$p\t$q\t$HITS{$p}->{$qdb}->{per_id}\t!!! no reciprocal hit\n";
# $BBH{$p} = {$q};
}
# This is the magic - when two proteins are reciprocal best matches
elsif ($HITS->{$q}->{$db}->{'tophit'} eq $p) {
if ($BBH{$p}->{$qdb} eq $q && $BBH{$q}->{$db} eq $p) { next } # we already found this one
# print "$p\t$q\t$HITS{$p}->{$qdb}->{bits}\n";
# Add per_ids to array for AAAI calculation
push @{$per_id_ref{$db}->{$qdb}}, ($HITS->{$p}->{$qdb}->{'per_id'}, $HITS->{$q}->{$db}->{'per_id'});
push @{$per_id_ref{$qdb}->{$db}}, ($HITS->{$p}->{$qdb}->{'per_id'}, $HITS->{$q}->{$db}->{'per_id'});
$BBH{$p}->{$qdb} = $q;
$BBH{$q}->{$db} = $p;
}
}
}
return (\%per_id_ref);
}
sub read_file {
my $file = shift;
my $PtoDB = shift;
print STDERR "Reading file $file\n";
open my $IN, $file or die "Can't open file $file : $!\n";
my $header;
my $last;
while (my $line = <$IN>) {
my ($qryacc, $refacc, $bits, $per_id, $qry_len, $ref_len);
# Need the header stuff for MSOAR, if we're using that, which we aren't.
if ($line =~ /^\#/) {
$header .= $line;
next;
}
chomp $line;
my @f = split/\t/, $line;
if ($line && @f == 14) {
# tab_blast data
$qryacc = id_from_accession($f[0]);
$refacc = id_from_accession($f[4]);
$bits = $f[10];
$per_id = $f[12];
$qry_len = $f[1];
$ref_len = $f[6];
} elsif ($line && @f == 12) {
# mtab data
$qryacc = id_from_accession($f[0]);
$refacc = id_from_accession($f[1]);
$bits = $f[11] * 1;
$per_id = $f[2];
$qry_len = 1;
$ref_len = 1;
} else {
die "Don't recognize format of tabulated blast output\n";
}
if (! defined $PtoDB->{$qryacc}) {
die "Hey, fool. The accession $f[0] ($qryacc) isn't in the fasta file. Huh?\n";
}
if (! defined $PtoDB->{$refacc}) {
die "Hey, fool. The accession $f[4] ($refacc) isn't in the fasta file. Huh?\n";
}
# self hit
if ($qryacc eq $refacc) {
$HITS->{$qryacc}->{self_hit} = $bits;
next;
}
# paralogous hit (in same genome)
elsif ($PtoDB->{$qryacc} eq $PtoDB->{$refacc}) { next }
# This is the meat right here. Comparison against length and id cutoffs
# populates the Data table.
if (!defined $HITS->{$qryacc}->{$PtoDB->{$refacc}} && # this is the top hit to this genome
$per_id >= $id_cutoff &&
$ref_len/$qry_len >= $length_cutoff) {
# why do I calculate this score and then not use it? I guess this gets piped to paranoid, and it doesn't like non-1 values?
my $score = $HITS->{$qryacc}->{self_hit} ? $bits/$HITS->{$qryacc}->{self_hit} : 1;
$HITS->{$qryacc}->{$PtoDB->{$refacc}} = {'header' => $header,
'tophit' => $refacc,
'per_id' => $per_id,
'bits' => $bits,
'score' => 1,
'blast_data' => $line};
}
$header = "";
}
close $IN;
return $HITS;
}