-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathprepare-numeric-edges.py
42 lines (29 loc) · 1.36 KB
/
prepare-numeric-edges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
from util import Dataset
# Script purpose: for every numeric feature, emit binary indicator columns
# marking rows whose value is exactly 0.0 ("<name>_zero") or exactly 1.0
# ("<name>_one"), and store them as the 'numeric_edges' part for both the
# train and test sets.

# An indicator is only generated when MORE than this many rows (train and
# test combined) hit the edge value.
MIN_EDGE_COUNT = 20

# Edge values to flag, paired with the suffix appended to the feature name.
# Order matters: it fixes the column order of the saved matrices.
EDGE_VALUES = ((0.0, '_zero'), (1.0, '_one'))


def _to_sparse(columns, n_rows):
    """Stack (n_rows, 1) indicator columns side by side into a CSR matrix.

    np.hstack raises ValueError on an empty sequence, so when no indicator
    column was generated an empty (n_rows, 0) uint8 matrix is returned instead.
    """
    if not columns:
        return sp.csr_matrix((n_rows, 0), dtype=np.uint8)
    return sp.csr_matrix(np.hstack(columns))


# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Loading data...")

# NOTE(review): assumes load_part returns 2-D arrays indexable as [:, col],
# with columns in the same order as num_features -- confirm against util.Dataset.
num_features = Dataset.get_part_features('numeric')
train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

train_n = train_num.shape[0]
test_n = test_num.shape[0]

features = []   # names of the generated indicator columns
train_res = []  # one (train_n, 1) uint8 column per generated feature
test_res = []   # one (test_n, 1) uint8 column per generated feature

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col, col_name in enumerate(num_features):
        # Train and test are concatenated so the frequency threshold is
        # applied to the combined data; rows [:train_n] are the train rows.
        values = np.hstack((train_num[:, col], test_num[:, col]))

        for edge, suffix in EDGE_VALUES:
            # Exact float equality is intentional: only literal edge values
            # are flagged. The mask is computed once and then sliced.
            mask = values == edge

            if mask.sum() > MIN_EDGE_COUNT:
                features.append(col_name + suffix)
                train_res.append(mask[:train_n].astype(np.uint8).reshape((train_n, 1)))
                test_res.append(mask[train_n:].astype(np.uint8).reshape((test_n, 1)))

        pbar.update(1)

print("Saving...")

Dataset.save_part_features('numeric_edges', features)
Dataset(numeric_edges=_to_sparse(train_res, train_n)).save('train')
Dataset(numeric_edges=_to_sparse(test_res, test_n)).save('test')

print("Done.")