# semiSurpervisedDataset.py
import numpy as np
import pandas as pd
import math
import pickle as pkl
from tqdm import tqdm
import torch
import copy
import configparser
config=configparser.ConfigParser()
config.read('./config.ini')
np.random.seed(0)
class SemiSupervisedDataset(object):
    def __init__(self, curated_data_path, curated_csv, noisy_data_path, noisy_csv, val_data_path, val_csv,
                 batch_size=64, duration=128, masking_max_percentage=0.15):
self.duration = duration
self.batch_size = batch_size
self.masking_max_percentage = masking_max_percentage
        print('loading curated data from pkl...')
        with open(curated_data_path, 'rb') as f:
            self.curated_data = pkl.load(f)
        with open(val_data_path,'rb') as f:
            self.val_data = pkl.load(f)
        print('loading noisy data from pkl...')
        with open(noisy_data_path, 'rb') as f:
            self.noisy_data = pkl.load(f)
self.curated_csv_df = pd.read_csv(curated_csv)
self.val_csv_df = pd.read_csv(val_csv)
self.noisy_csv_df = pd.read_csv(noisy_csv)
        # reference dataframe that fixes the label order
        self.ref_df = pd.read_csv(config.get('DataPath','ref_csv_path'))
        self.labels_str = self.ref_df.columns[1:].tolist() # ordered label strings; .index() maps label -> position
        self.num_classes = len(self.labels_str)
self.init_labeled_unlabeled_data()
def get_class_num(self):
return self.num_classes
def get_numof_batch(self,istraindata=True):
if istraindata:
            return int(math.ceil(len(self.labeled_data_epoch)/self.batch_size))
else:
return int(math.ceil(len(self.val_data)/self.batch_size))
def get_samplenum_perEpoch(self):
        assert len(self.unlabeld_data_epoch) == len(self.labeled_data_epoch),'labeled data and unlabeled data len not match'
return len(self.labeled_data_epoch)*2 # containing N/2 labeled and N/2 unlabeled data
def init_labeled_unlabeled_data(self):
"""
initialize labeled and unlabled data
will get:
1. self.val_labels: list of np.array read from pre-splited file
2. self.labeled_data,self.labels: list of np.array consists of train data split from curated data + noisy data with guessing labels
3. self.unlabeld_data: list of np.array, noisy data without guessing labels
"""
self.labeled_data = []
self.unlabeld_data = []
self.labels = np.array([]).reshape(0,self.num_classes)
# all curated data should be labeled data
curated_labels = np.array([]).reshape(0,self.num_classes)
for i, row in enumerate(self.curated_csv_df['labels'].str.split(',')):
onesample_label = np.zeros([1,self.num_classes])
for label in row:
idx = self.labels_str.index(label)
onesample_label[0,idx] = 1
curated_labels = np.vstack((curated_labels,onesample_label))
assert len(self.curated_data) == curated_labels.shape[0],'curated data and labels len not match'
# get val labels
self.val_labels = np.array([]).reshape(0,self.num_classes)
for i, row in enumerate(self.val_csv_df['labels'].str.split(',')):
onesample_label = np.zeros([1,self.num_classes])
for label in row:
idx = self.labels_str.index(label)
onesample_label[0,idx] = 1
self.val_labels = np.vstack((self.val_labels,onesample_label))
        # add the curated training split into labeled_data
self.labeled_data += self.curated_data
self.labels = np.vstack((self.labels,curated_labels))
        # only part of the noisy data comes with labels
        for i,row in enumerate(self.noisy_csv_df['labels'].str.split(',')):
            if not isinstance(row,list): # row is NaN, i.e. this sample has no labels
self.unlabeld_data.append(self.noisy_data[i])
else:
self.labeled_data.append(self.noisy_data[i])
onesample_label = np.zeros([1,self.num_classes])
for label in row:
idx = self.labels_str.index(label)
onesample_label[0,idx] = 1
self.labels = np.vstack((self.labels,onesample_label))
assert len(self.labeled_data) == self.labels.shape[0],'labeled data len {} != labels len {}'.format(len(self.labeled_data),self.labels.shape[0])
        print('labeled data:{}, unlabeled data:{}'.format(len(self.labeled_data),len(self.unlabeld_data)))
def select_datafromlist(self,data,idx):
# data: list(np.array)
# idx: np.array [N,]
return [data[int(i)] for i in idx]
    def shuffle_and_cut(self):
        """
        since each training batch uses labeled and unlabeled data half and half,
        the labeled and unlabeled pools must be cut to the same size for one epoch
        this function must be called before the beginning of each epoch
        """
# shuffle labeled data
shuffle_idx = np.random.permutation(len(self.labeled_data))
self.labeled_data,self.labels = self.select_datafromlist(self.labeled_data,shuffle_idx),self.labels[shuffle_idx]
# shuffle unlabeled data
shuffle_idx = np.random.permutation(len(self.unlabeld_data))
self.unlabeld_data = self.select_datafromlist(self.unlabeld_data,shuffle_idx)
oneEpochLen = min(len(self.labeled_data),len(self.unlabeld_data))
self.labeled_data_epoch,self.labels_epoch = self.labeled_data[:oneEpochLen],self.labels[:oneEpochLen]
self.unlabeld_data_epoch = self.unlabeld_data[:oneEpochLen]
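        # e.g. (illustrative numbers) with 5000 labeled and 20000 unlabeled samples,
        # one epoch uses the first 5000 of each after shuffling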
def cut_from_onedata(self,data):
# data: np.array [128,x,3]
# result: np.array [m,3,duration,128]
        # m: int, how many samples are cut from the original data
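        # shape walkthrough (illustrative): a [128,300,3] input with duration=128 gives
        # m=3, is zero-padded to [128,384,3], and comes out as [3,3,128,128]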
m = int(math.ceil(data.shape[1]/self.duration))
# padding
res = np.zeros([data.shape[0],self.duration*m,data.shape[2]])
res[:,:data.shape[1],:] = data
res = np.swapaxes(res,0,2) #[3,duration*m,128]
res = np.reshape(res,[res.shape[0],m,self.duration,res.shape[2]])#[3,m,duration,128]
res = np.swapaxes(res,0,1) #[m,3,duration,128]
return res,m
def cut_from_batchdata(self,batch_data):
# batch_data: list of np.array batch_size * [128,x,3]
        # result : np.array [M, 3, duration, 128], where M can vary between batches
# subsamplenum_list: list(int) how many samples are cut from each original sample in one batch
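        # note: M == sum(subsamplenum_list), since each source clip contributes m windows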
res = np.array([]).reshape(0,batch_data[0].shape[2],self.duration,batch_data[0].shape[0])
subsamplenum_list = []
for data in batch_data:
cut_data,m = self.cut_from_onedata(data)
res = np.vstack((res,cut_data))
subsamplenum_list.append(m)
return res,subsamplenum_list
def spec_augment(self,data_batch):
"""
doing stochastic data augmentation on each data sample
reference: https://arxiv.org/pdf/1904.08779.pdf
data_batch : list of np.array
return :
data_batch_cp: list of np.array, which has been data augmented
"""
h_percentage = np.random.uniform(low=0., high=self.masking_max_percentage, size=len(data_batch))
w_percentage = np.random.uniform(low=0., high=self.masking_max_percentage, size=len(data_batch))
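        # e.g. (illustrative) with masking_max_percentage=0.15 and 128 mel bins,
        # at most int(0.15*128) = 19 frequency rows are zeroed per sample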
        data_batch_cp = copy.deepcopy(data_batch) # deep copy so the original data is not modified; [batch_size,128,x,3]
for i in range(len(data_batch_cp)):
height = data_batch_cp[i].shape[0]
width = data_batch_cp[i].shape[1]
h_mask = int(h_percentage[i] * height)
h = int(np.random.uniform(0.,height-h_mask))
data_batch_cp[i][h:h+h_mask,:,:] = 0
w_mask = int(w_percentage[i]*width)
w = int(np.random.uniform(0., width - w_mask))
data_batch_cp[i][:,w:w+w_mask,:] = 0
return data_batch_cp
def get_data(self,batchidx,istraining=True):
"""
get a batch of data consists of both labeled_data & unlabeld_data(half half) for training process
and get a batch data consists of only curated data for validation process
batchidx: int
"""
if istraining:
# labeled data part
batch_labeled_data = self.spec_augment(self.labeled_data_epoch[batchidx*self.batch_size:(batchidx+1)*self.batch_size]) #list of np.array: batch_size * np.array[128,x,3]
labels = self.labels_epoch[batchidx*self.batch_size:(batchidx+1)*self.batch_size,:] #np.array[batch_size,num_classes]
cut_batch_labeled_data,labeled_data_subsamplenum_list = self.cut_from_batchdata(batch_labeled_data)
# unlabeled data part
batch_unlabeled_data = self.spec_augment(self.unlabeld_data_epoch[batchidx*self.batch_size:(batchidx+1)*self.batch_size])
cut_batch_unlabeled_data,unlabeled_data_subsamplenum_list = self.cut_from_batchdata(batch_unlabeled_data)
            # labels for this batch; rows belonging to unlabeled data are flagged below
            assert len(batch_labeled_data) == len(batch_unlabeled_data), 'labeled data and unlabeled data size not equal'
            batch_labels = np.zeros([labels.shape[0]*2,labels.shape[1]])
            batch_labels[:labels.shape[0],:] = labels
            batch_labels[labels.shape[0]:,0] = -1 # set the first label element to -1 to mark the row as unlabeled
            # cut data for this batch
cut_batch_data = np.concatenate((cut_batch_labeled_data,cut_batch_unlabeled_data),axis=0)
# samplenum list for this batch
subsamplenum_list = labeled_data_subsamplenum_list + unlabeled_data_subsamplenum_list
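            # Layout example (illustrative, batch_size=2, num_classes=3):
            # batch_labels == [[0,1,0],    <- labeled
            #                  [1,0,0],    <- labeled
            #                  [-1,0,0],   <- unlabeled marker
            #                  [-1,0,0]]   <- unlabeled marker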
else:
data,labels = self.val_data,self.val_labels
batch_data = data[batchidx*self.batch_size:(batchidx+1)*self.batch_size] #list of np.array: batch_size * np.array[128,x,3]
batch_labels = labels[batchidx*self.batch_size:(batchidx+1)*self.batch_size,:] #np.array[batch_size,num_classes]
cut_batch_data,subsamplenum_list = self.cut_from_batchdata(batch_data)
return torch.from_numpy(cut_batch_data).float(),subsamplenum_list,torch.from_numpy(batch_labels).float()
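# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original training pipeline).
# The pkl/csv paths below are placeholders -- substitute the real preprocessed
# files, and make sure config.ini provides DataPath/ref_csv_path.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    dataset = SemiSupervisedDataset(
        curated_data_path='curated_train.pkl', curated_csv='curated_train.csv',
        noisy_data_path='noisy_train.pkl', noisy_csv='noisy_train.csv',
        val_data_path='val.pkl', val_csv='val.csv',
        batch_size=64, duration=128)
    dataset.shuffle_and_cut()  # required before every epoch
    for batchidx in tqdm(range(dataset.get_numof_batch(istraindata=True))):
        data, subsamplenum_list, labels = dataset.get_data(batchidx, istraining=True)
        # data: [M,3,duration,128] float tensor; label rows with labels[:,0] == -1
        # belong to unlabeled clips and should receive guessed labels downstream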