data_helper.py
import os
import codecs
import math
import random
import numpy as np
from tqdm import tqdm
from collections import Counter
import gensim
PAD = "<pad>"
UNK = "<unk>"
# cf. https://courses.washington.edu/hypertxt/csar-v02/penntable.html
# cf. http://wacky.sslmit.unibo.it/lib/exe/fetch.php?media=tagsets:ukwac_tagset.txt
verb_pos = ["VB", "VBZ", "VBP", "VBD", "VBN", "VBG", "VV", "VVN", "VVG", "VVD", "VVZ", "VVP", "VH", "VHP", "VHZ", "VHD",
"VHG", "VHN"]
separate = "\t"
neg_separate = " , "
word2vec_path = os.path.join(os.path.expanduser("~"), "data", "embeddings", "GoogleNews-vectors-negative300.bin")
glove_path = os.path.join(os.path.expanduser("~"), "data", "embeddings", "glove", "glove.840B.300d.txt")


def build_word2vec(filename, save_path, word_dict, dim):
    """Initialize a (len(word_dict), dim) embedding matrix at random, overwrite the rows of
    words covered by the pretrained word2vec vectors, and save it as a compressed .npz file."""
scale = np.sqrt(3.0 / dim)
embeddings = np.random.uniform(-scale, scale, [len(word_dict), dim])
if filename is None:
filename = word2vec_path
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True)
    # load_word2vec_format already returns a KeyedVectors object, so no ".wv" indirection is
    # needed (note: gensim >= 4.0 exposes the vocabulary as w2v_model.key_to_index instead)
    for word in w2v_model.vocab:
        if word in word_dict:
            idx = word_dict[word]
            embeddings[idx] = w2v_model[word]
        elif word.lower() in word_dict:
            idx = word_dict[word.lower()]
            embeddings[idx] = w2v_model[word]
np.savez_compressed(save_path, embeddings=embeddings)
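
# Usage sketch (assumes the pretrained GoogleNews vectors at `word2vec_path` and the word
# vocabulary produced by build_dataset_and_vocab; the .npz output path is illustrative):
#   word_dict = read_vocab_to_dict(os.path.join("data", "word_vocab.txt"))
#   build_word2vec(None, os.path.join("data", "word2vec_emb.npz"), word_dict, dim=300)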


def build_glove(filename, context_path, target_path, word_dict, verb_dict, dim):
    """Build randomly initialized context (word) and target (verb) embedding matrices,
    overwrite the rows covered by pretrained GloVe vectors, and save both as .npz files."""
scale = np.sqrt(3.0 / dim)
context_emb = np.random.uniform(-scale, scale, [len(word_dict), dim])
target_emb = np.random.uniform(-scale, scale, [len(verb_dict), dim])
with codecs.open(filename, mode="r", encoding="utf-8") as f:
for line in tqdm(f, total=int(2.2e6), desc="Load glove embeddings"):
line = line.lstrip().rstrip().split(" ")
word = line[0]
vector = [float(x) for x in line[1:]]
# for context embedding
if word in word_dict:
idx = word_dict[word]
context_emb[idx] = np.asarray(vector)
elif word.lower() in word_dict:
idx = word_dict[word.lower()]
context_emb[idx] = np.asarray(vector)
# for target embedding
if word in verb_dict:
idx = verb_dict[word]
target_emb[idx] = np.asarray(vector)
elif word.lower() in verb_dict:
idx = verb_dict[word.lower()]
target_emb[idx] = np.asarray(vector)
np.savez_compressed(context_path, embeddings=context_emb)
np.savez_compressed(target_path, embeddings=target_emb)
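
# Usage sketch (assumes the GloVe 840B.300d text file at `glove_path`; "verb_vocab.txt" is a
# hypothetical verb vocabulary in the same one-token-per-line format as word_vocab.txt, and
# the .npz output paths are illustrative):
#   word_dict = read_vocab_to_dict(os.path.join("data", "word_vocab.txt"))
#   verb_dict = read_vocab_to_dict(os.path.join("data", "verb_vocab.txt"))
#   build_glove(glove_path, os.path.join("data", "context_emb.npz"),
#               os.path.join("data", "target_emb.npz"), word_dict, verb_dict, dim=300)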


def ukwac_corpus_iterator(ukwac_path=None, lowercase=True):
    """Iterate over the PoS-tagged ukWaC corpus files and yield one sentence at a time as
    (words, poss, lemmas); sentences longer than 80 tokens are skipped."""
if ukwac_path is None:
ukwac_path = os.path.join(os.path.expanduser("~"), "data", "ukwac", "ukwac_pos", "pos_text")
ukwac_file = os.path.join(ukwac_path, "UKWAC-{}.xml")
if not os.path.exists(ukwac_path):
raise IOError("Unable to find the corpus directory: %s" % ukwac_path)
    file_indices = list(range(1, 26))  # UKWAC-1.xml ... UKWAC-25.xml
for file_index in file_indices:
words, poss, lemmas = [], [], []
total = int(3.34e7) if file_index == 25 else int(1e8)
with codecs.open(ukwac_file.format(file_index), mode="r", encoding="utf-8", errors="ignore") as f:
for line in tqdm(f, total=total, desc="Read UKWAC-{}.xml".format(file_index)):
line = line.lstrip().rstrip()
if line.startswith("<text id") or line.startswith("<s>") or line.startswith("</text>"):
continue
if line.startswith("</s>"): # read one sentence
if len(words) > 80: # if sentence length is greater than 80, then ignore this sentence
words, poss, lemmas = [], [], []
continue
yield words, poss, lemmas
words, poss, lemmas = [], [], []
continue
if len(line.split("\t")) != 3:
continue
word, pos, lemma = line.split("\t")
words.append(word.strip().lower() if lowercase else word.strip())
poss.append(pos.strip())
lemmas.append(lemma.strip().lower() if lowercase else lemma.strip())
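
# Example (sketch): inspect the first few parsed sentences, assuming the tagged corpus files
# UKWAC-1.xml ... UKWAC-25.xml exist under ~/data/ukwac/ukwac_pos/pos_text:
#   for i, (words, poss, lemmas) in enumerate(ukwac_corpus_iterator()):
#       print(list(zip(words, poss, lemmas)))
#       if i >= 2:
#           break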


def build_dataset_and_vocab(save_path, word_threshold=90):
    """Extract (left context, verb, right context) instances from ukWaC into dataset.txt,
    build the word vocabulary and verb counts, save them together with a negative sampling
    table, and return that table."""
word_counter, verb_counter = Counter(), Counter()
save_file = codecs.open(os.path.join(save_path, "dataset.txt"), mode="w", encoding="utf-8")
for words, poss, _ in ukwac_corpus_iterator():
for idx, word in enumerate(words):
# count all words
word_counter[word] += 1
# count verbs
if poss[idx] in verb_pos:
verb_counter[word] += 1
            # build dataset: keep verbs that have both a non-empty left and right context
            if idx != 0 and idx != len(words) - 1 and poss[idx] in verb_pos:
left_context = " ".join(words[:idx])
right_context = " ".join(words[idx + 1:])
save_file.write(separate.join([left_context, word, right_context]) + "\n")
save_file.flush()
save_file.close()
# build word vocabulary
word_vocab = [PAD] + [word for word, count in word_counter.most_common() if count >= word_threshold] + [UNK]
# create negative sampling table
negative_table = build_negative_sampling_table(verb_counter, table_size=int(1e8), threshold=word_threshold)
# save to file
with codecs.open(os.path.join(save_path, "word_vocab.txt"), mode="w", encoding="utf-8") as f:
f.write("\n".join(word_vocab))
with codecs.open(os.path.join(save_path, "verb_count.txt"), mode="w", encoding="utf-8") as f:
f.write("\n".join(["{}\t{}".format(word, count) for word, count in verb_counter.most_common()]))
with codecs.open(os.path.join(save_path, "neg_table.txt"), mode="w", encoding="utf-8") as f:
f.write("\n".join(negative_table))
return negative_table


def build_negative_sampling_table(verb_freq_dict, table_size=int(1e8), threshold=90):
    """Build a unigram negative sampling table: verb counts are raised to the power 0.75
    (as in word2vec), normalized, and each verb fills a proportional share of the table.
    `verb_freq_dict` is either a Counter of verb counts or the path to verb_count.txt."""
if isinstance(verb_freq_dict, str):
tmp_dict = Counter()
with codecs.open(verb_freq_dict, mode="r", encoding="utf-8") as f:
for line in f:
verb, count = line.strip().split("\t")
tmp_dict[verb] += int(count)
verb_freq_dict = tmp_dict.copy()
verb_freq_dict = dict([(verb, math.pow(count, 0.75)) for verb, count in verb_freq_dict.most_common()
if count >= threshold])
sum_freq = sum([freq for _, freq in verb_freq_dict.items()])
verb_freq_dict = dict([(verb, freq / sum_freq) for verb, freq in verb_freq_dict.items()])
negative_table = []
for verb, freq in verb_freq_dict.items():
negative_table += [verb] * int(freq * table_size)
random.shuffle(negative_table)
return negative_table
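
# Example (sketch): with Counter({"run": 400, "eat": 100}) and the default threshold of 90,
# the smoothed weights are 400**0.75 (about 89.4) and 100**0.75 (about 31.6), so "run" fills
# roughly 74% of the table and "eat" roughly 26%:
#   table = build_negative_sampling_table(Counter({"run": 400, "eat": 100}), table_size=1000)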


def negative_sampling(filename, file_out, negative_table, num_neg=10):
    """For every (left context, verb, right context) line in `filename`, sample `num_neg`
    distinct negative verbs from `negative_table` and write the extended line to `file_out`."""
np.random.seed(123456)
if filename is None or not os.path.exists(filename):
raise IOError("Unable to find the dataset file: %s" % filename)
neg_size = len(negative_table)
f_out = codecs.open(file_out, mode="w", encoding="utf-8")
with codecs.open(filename, mode="r", encoding="utf-8", errors="ignore") as f:
for line in tqdm(f, desc="Read dataset"):
l_context, verb, r_context = line.strip().split("\t")
# negative sampling
neg_verbs = []
for _ in range(num_neg):
idx = np.random.randint(neg_size)
while negative_table[idx] == verb or negative_table[idx] in neg_verbs:
idx = np.random.randint(neg_size)
neg_verbs.append(negative_table[idx])
f_out.write(separate.join([l_context, verb, r_context, neg_separate.join(neg_verbs)]) + "\n")
f_out.flush()
f_out.close()


def negative_sample(verb, negative_table, neg_size, num_neg=10):
    """Sample `num_neg` distinct verbs from `negative_table`, all different from `verb`."""
neg_verbs = []
for _ in range(num_neg):
idx = np.random.randint(neg_size)
while negative_table[idx] == verb or negative_table[idx] in neg_verbs:
idx = np.random.randint(neg_size)
neg_verbs.append(negative_table[idx])
return neg_verbs
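
# Example (sketch): draw 10 negatives for the verb "run", assuming the sampling table exists
# and contains at least 10 distinct verbs other than "run" (otherwise the rejection loop
# above would never terminate):
#   neg_table = build_negative_sampling_table(os.path.join("data", "verb_count.txt"))
#   negatives = negative_sample("run", neg_table, len(neg_table), num_neg=10)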


def load_verb_count(filename):
    """Load verb counts from a "verb<TAB>count" file and return the list of counts that reach
    the vocabulary threshold of 90; the file is assumed to be sorted by descending count, as
    written by build_dataset_and_vocab."""
count_list = []
with codecs.open(filename, "r", encoding="utf-8") as f:
for line in f:
line = line.strip().split("\t")
count = int(line[1])
if count < 90:
break
count_list.append(count)
return count_list


def read_vocab_to_dict(filename):
    """Read a vocabulary file (one token per line) into a token -> index dictionary."""
vocab = dict()
with codecs.open(filename, mode="r", encoding="utf-8") as f:
for idx, word in enumerate(f):
word = word.lstrip().rstrip()
vocab[word] = idx
return vocab


def pad_sequence(sequence, pad_tok=0, max_length=None):
    """Pad a batch of variable-length sequences to a common length.
    :param sequence: list of sequences with shape (batch_size, variable seq_length)
    :param pad_tok: padding token, default is 0
    :param max_length: length to pad to; if None, the longest sequence in the batch is used
    :return: (padded sequences, original sequence lengths clipped to max_length)
    """
if max_length is None:
max_length = max([len(seq) for seq in sequence])
sequence_padded, seq_length = [], []
for seq in sequence:
seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0)
sequence_padded.append(seq_)
seq_length.append(min(len(seq), max_length))
return sequence_padded, seq_length
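
# Example: pad_sequence([[1, 2, 3], [4, 5]]) returns ([[1, 2, 3], [4, 5, 0]], [3, 2]),
# i.e. the shorter sequence is right-padded with 0 and the true lengths are kept.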


def build_batch_dataset(left_context, verbs, right_context):
    """Pad the left/right contexts and pack a batch dict with keys "lc", "ll", "rc", "rl",
    "vb", and "batch_size"."""
left_context, left_seq_len = pad_sequence(left_context)
right_context, right_seq_len = pad_sequence(right_context)
batch_size = len(verbs)
return {"lc": left_context, "ll": left_seq_len, "rc": right_context, "rl": right_seq_len, "vb": verbs,
"batch_size": batch_size}


def dataset_iterator(dataset_file, word_dict, verb_dict, batch_size):
    """Stream the negatively sampled dataset file, map words and verbs to indices (unknown
    tokens to UNK), and yield batches of size `batch_size`; the final, smaller batch is also
    yielded."""
if dataset_file is None or not os.path.exists(dataset_file):
raise IOError("Unable to find the dataset file: %s" % dataset_file)
with codecs.open(dataset_file, mode="r", encoding="utf-8", errors="ignore") as f_dataset:
left_context, verbs, right_context = [], [], []
# for line in tqdm(f_dataset, total=int(2.663e8), desc="Read dataset"):
for line in f_dataset:
# split data
l_c, vb, r_c, _ = line.strip().split("\t")
# convert to indices
l_c = [word_dict[word] if word in word_dict else word_dict[UNK] for word in l_c.split(" ")]
vb = verb_dict[vb] if vb in verb_dict else verb_dict[UNK]
r_c = [word_dict[word] if word in word_dict else word_dict[UNK] for word in r_c.split(" ")]
# add to list
left_context.append(l_c)
verbs.append(vb)
right_context.append(r_c)
# yield batched dataset
if len(left_context) == batch_size:
yield build_batch_dataset(left_context, verbs, right_context)
left_context, verbs, right_context = [], [], []
if len(left_context) > 0:
yield build_batch_dataset(left_context, verbs, right_context)
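
# Usage sketch (assumes dataset_neg.txt produced by negative_sampling and the vocabularies
# built above; "verb_vocab.txt" is a hypothetical verb vocabulary file):
#   word_dict = read_vocab_to_dict(os.path.join("data", "word_vocab.txt"))
#   verb_dict = read_vocab_to_dict(os.path.join("data", "verb_vocab.txt"))
#   for batch in dataset_iterator(os.path.join("data", "dataset_neg.txt"), word_dict,
#                                 verb_dict, batch_size=32):
#       pass  # batch["lc"], batch["ll"], batch["rc"], batch["rl"], batch["vb"], batch["batch_size"]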


if __name__ == "__main__":
# build dataset
# neg_table = build_dataset_and_vocab("data")
neg_table = build_negative_sampling_table("data/verb_count.txt")
# negative sampling for verbs
negative_sampling(os.path.join("data", "dataset.txt"), os.path.join("data", "dataset_neg.txt"), neg_table)
# build verb vocabulary
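    # (sketch, assuming the verb vocabulary mirrors word_vocab.txt: <pad> + verbs whose count
    # reaches the threshold + <unk>; "verb_vocab.txt" is a hypothetical output name)
    # verb_counter = Counter()
    # with codecs.open(os.path.join("data", "verb_count.txt"), "r", encoding="utf-8") as f:
    #     for line in f:
    #         verb, count = line.strip().split("\t")
    #         verb_counter[verb] = int(count)
    # verb_vocab = [PAD] + [v for v, c in verb_counter.most_common() if c >= 90] + [UNK]
    # with codecs.open(os.path.join("data", "verb_vocab.txt"), mode="w", encoding="utf-8") as f:
    #     f.write("\n".join(verb_vocab))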