-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy patharthropodBorne_featureSelection.R
101 lines (77 loc) · 2.98 KB
/
arthropodBorne_featureSelection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Babayan, Orton & Streicker
Predicting Reservoir Hosts and Arthropod Vectors from Evolutionary Signatures in RNA Virus Genomes
-- Genomic bias feature selection for arthropod-borne transmission model
"""
rm(list=ls())
setwd("") # Set local working directory where files are located
library(plyr)
library(h2o) # https://www.h2o.ai/products/h2o/
library(dplyr)
library(reshape2)
library(matrixStats)
# Start h2o JVM
localh20<-h2o.init(nthreads = -1)
# Read data from file
f1<-read.csv("BabayanEtAl_VirusData.csv")
# Feature definition:
## Genomic features: select all combinations of dinucleotides, codon pairs, amino-acids, and codons (biases)
dinucs<-grep("[A|T|G|C|U]p[A|T|G|C|U]",names(f1),value=T)
cps<-grep(".[A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|X|Y]..[A|T|G|C|U]",names(f1),value=T)
aa.codon.bias<-grep(".Bias",names(f1),value=T)
gen.feats<-c(dinucs,cps,aa.codon.bias)
total.feats<-length(gen.feats)
## Append metadata to feature set
f1<-f1[,c("Virus.name","Genbank.accession","Reservoir","Viral.group","Vector.borne","Vector",gen.feats)]
f1$response<-factor(f1$Vector.borne)
# Remove viruses with unknown arthropod-borne status
f2<-subset(f1,f1$Vector.borne!="?")
f_v<-droplevels(f2)
# Clean up temporary files
rm(f1,f2)
# Evaluate patterns over many training sets
set.seed(78910) # ensure reproducible runs
s<-.7 # proportion in the training set
nloops<-50 # allow model-training loop to run on 'nloops' different training-test set combinations
vimps<-matrix(nrow=total.feats-2,ncol=nloops)
for (i in 1:nloops){
# Build training set
trains<-f_v %>% group_by(Vector.borne) %>%
filter(Genbank.accession %in% sample(unique(Genbank.accession), floor(s*length(unique(Genbank.accession)))))
set<-c("Vector.borne",gen.feats)
f1_train<-trains[,c(set)]
# Build h2o data frames
train<-as.h2o(f1_train)
# Identity the response column
y <- "Vector.borne"
# Identify the predictor columns
x <- setdiff(names(train), y)
# Convert response to factor
train[,y] <- as.factor(train[,y])
# GBM with 5x cross validation of training set, test set is not used
model <- h2o.gbm(x = x,
y = y,
training_frame = train,
ntrees = 150,
learn_rate = .1,
sample_rate = 1,
max_depth = 10,
col_sample_rate_per_tree = 1,
seed = 123,
nfolds = 5,
keep_cross_validation_predictions=T)
vi <- h2o.varimp(model)
data2 <- vi[order(vi[,1],decreasing=FALSE),] # order alphabetically
vimps[,i]<-data2[,4] # "percentage" importance
h2o.rm(model)
rm(train,f1_train,vi)
}
row.names(vimps)<-data2$variable
# Average feature importance across all training sets
vimean<-rowMeans2(vimps)
visd<-rowSds(vimps)
vistderr<-visd/sqrt(nloops)
vimps<-cbind(vimps,vimean,visd,vistderr)
vimps<- vimps[order(vimps[,nloops+1],decreasing=FALSE),]
# Write to file
write.csv(vimps,file="featureImportance_arthropodBorne.csv",row.names = T)