'''
A simple script for making ensemble classifiers with random forests and NBSVMs
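
Example invocation (a sketch only; 'reviews.csv', 'text', and 'label' are
hypothetical file and column names, while -vc, -sm, and -vm are the script's
own optional flags with their default values):

    python ensemble.py reviews.csv text label -vc tfidf -sm train-test -vm geometric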
'''
import argparse
import re

import pandas as pd
import numpy as np
from scipy.stats import gmean

import tools
from nbsvm import NBSVM
from rf import TextRF

'''
Main class for the model Ensemble. Methods generally follow scikit-learn style,
with the exceptions of add(), remove(), and score_sep(), which are specific
to this module.
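
A rough usage sketch (assumes X_train/X_test are vectorized document matrices
and y_train/y_test are binary label arrays, e.g. as produced by tools.TextData;
NBSVM and TextRF are the bundled model wrappers):

    ens = Ensemble()
    ens.add(NBSVM())
    ens.add(TextRF())
    ens.fit(X_train, y_train)
    ens.score_sep(X_test, y_test)        # per-model accuracies
    acc = ens.score(X_test, y_test)      # geometric-mean vote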
'''
class Ensemble:
    def __init__(self):
        self.mods = {}
        self.accs = {}
        self.data = []
        self.probs = []
        self.__name__ = 'Ensemble'

    # Adds a model to the ensemble
    def add(self, model):
        if 'sklearn' in model.__module__:
            modname = re.sub('sklearn', '', model.__module__)
            self.mods[modname] = model
            self.accs[modname] = 0.0
        else:
            self.mods[model.__name__] = model
            self.accs[model.__name__] = 0.0
        return

    # Removes a model from the ensemble
    def remove(self, name):
        del self.mods[name]
        del self.accs[name]
        return

    # Fits the models to the training data
    def fit(self, X, y):
        for mod in self.mods:
            self.mods.get(mod).fit(X, y)
        return

    # Scores the models individually on the test data
    def score_sep(self, X, y, verbose=True):
        for mod in self.mods:
            self.accs[mod] = self.mods.get(mod).score(X, y)
        if verbose:
            print self.accs
        return

    # Scores the ensemble on the test data
    def score(self, X, y, method='geometric', threshold=0.5):
        probs = self.predict_proba(X, y)
        if method == 'geometric':
            # Combines the models' probabilities with a geometric mean,
            # then thresholds the result to get class guesses
            mean_probs = gmean(probs, axis=1)
            guesses = [int(x >= threshold) for x in mean_probs]
            acc = np.true_divide(np.sum(guesses == y), len(y))
            return acc

    # Predicts results with the test data
    def predict(self, X, y, method='geometric', threshold=0.5):
        probs = self.predict_proba(X, y)
        if method == 'geometric':
            mean_probs = gmean(probs, axis=1)
            guesses = [int(x >= threshold) for x in mean_probs]
            return np.array(guesses)

    # Gets predicted probabilities for the test data
    def predict_proba(self, X, y, mean=False):
        probs = pd.DataFrame(np.zeros([X.shape[0], len(self.mods)]))
        probs.columns = self.mods.keys()
        for i in range(len(self.mods)):
            if self.mods.keys()[i] != 'nbsvm':
                # sklearn-style models: take the probability of the positive class
                probs.iloc[:, i] = self.mods.values()[i].predict_proba(X)[:, 1]
            else:
                # the NBSVM wrapper uses its own predict_proba signature
                probs.iloc[:, i] = self.mods['nbsvm'].predict_proba(X, y)
        if mean:
            return gmean(probs, axis=1)
        else:
            return probs
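
# A quick illustration of the geometric-mean vote used above (the numbers are
# made up): if two models give a document probabilities 0.8 and 0.6, then
# gmean([0.8, 0.6]) = sqrt(0.8 * 0.6), which is roughly 0.693; that clears the
# default 0.5 threshold, so the ensemble predicts the positive class.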

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Positional arguments
    parser.add_argument('data', help='path for the input data')
    parser.add_argument('x_name', help='name of the column holding the text')
    parser.add_argument('y_name', help='name of the column holding the target values')

    # Optional arguments for tuning
    parser.add_argument('-lm', '--limit_features', type=bool, default=True, help='limit the number of features?')
    parser.add_argument('-ft', '--features', type=int, default=35000, help='number of features, if limited')
    parser.add_argument('-ng', '--ngrams', type=int, default=2, help='max ngram size')
    parser.add_argument('-vm', '--vote_method', default='geometric', help='how to combine the class probabilities for scoring')
    parser.add_argument('-vc', '--vectorizer', default='tfidf', help='how to vectorize the corpus')
    parser.add_argument('-sm', '--split_method', default='train-test', help='split the data by var(iable), train-test, or cross-val')
    parser.add_argument('-sv', '--split_variable', help='which variable to use for splitting')
    parser.add_argument('-tv', '--test_value', help='which value of --split_variable to use for testing')
    parser.add_argument('-vb', '--verbose', default=True, help='should functions print updates as they go?')
    args = parser.parse_args()

    # Loading and processing the data
    df = pd.read_csv(args.data)
    d = tools.TextData()
    if args.limit_features:
        d.process(df, args.x_name, args.y_name, method=args.vectorizer, max_features=args.features, verbose=args.verbose)
    else:
        d.process(df, args.x_name, args.y_name, method=args.vectorizer, max_features=None, verbose=args.verbose)

    # Getting the training and test sets
    d.split(args.split_method, args.split_variable, args.test_value)

    # Adding the models and scoring the ensemble
    ens = Ensemble()
    ens.add(NBSVM())
    ens.add(TextRF())
    ens.fit(d.X_train, d.y_train)
    ens.score_sep(d.X_test, d.y_test, verbose=False)
    acc = ens.score(d.X_test, d.y_test, method=args.vote_method)
    if args.verbose:
        print '\nResults:'
        print ens.accs
        print 'Ensemble accuracy is %0.4f' % acc