#!/usr/bin/python
from nltk.stem.porter import PorterStemmer
from tokenizer import TreebankWordTokenizer  # presumably a project-local module; nltk.tokenize.TreebankWordTokenizer is the upstream equivalent
import gensim
import logging
from datetime import datetime
import cython  # not used directly; importing it confirms gensim's optimized (C-compiled) word2vec routines are available
import re
from os.path import exists
# Dear future Jeremy,
# you are going to want to know that there are > 54,908,750 sentences
# in the corpus.
# this setup trains a phrase2vec trigram model to join "new", "york", "times" as "new_york_times"
# using a (nonrandom) sample of 10% of the corpus
# and then trains a word2vec model on the whole corpus.
# if you want to play with the phrases models, here's what to c/p into python
# import gensim
# bigrams_model_name = "bigrams_model.bin"
# bigrams_model = gensim.models.phrases.Phraser.load(bigrams_model_name)
# trigrams_model_name = "trigrams_model.bin"
# trigrams_model = gensim.models.phrases.Phraser.load(trigrams_model_name)
# trigrams_model[bigrams_model[["hillary", "rodham"]]]
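# (if the bigram was learned, that last line should come back as something like
# ["hillary_rodham"]; the exact result depends on the thresholds and the sample)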
# the POS-tagged corpora were generated from sentences.txt
# with ./stanford-postagger.sh models/wsj-0-18-left3words-distsim.tagger ~/code/ineffable_wizardry_of_cleaning/nyt_sentences_5.5M.txt > ~/code/ineffable_wizardry_of_cleaning/nyt_sentences_5.5M_tagged.txt
# at 19k words per second, the 5.5M sentence sample took ~1.5 hours.
stemming = False
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# sentences_filename = "eng_news_2013_3M/eng_news_2013_3M-sentences.txt"
sentences_filename = "nyt_sentences_tagged.txt" # should really be nyt/taggerAPI/sentences.txt but that takes 6x as long to train versus nyt_sentences_5.5M.txt
smaller_sentences_filename = "nyt_sentences_5.5M_tagged.txt"
pos_tagged = 'tagged' in sentences_filename
start = datetime.now()
print("start training w2v " + str(start))
# import re
# apostrophe_tests = {
#     "'ere": False,
#     "'geez": False,
#     "can't": True,
#     "would've": True,
#     "cheese": True,
#     "eatin'": True,
#     "''what": False,
#     "''what''": False,
#     "that''": False,
# }
# for test in apostrophe_tests.keys():
#     if not bool(alpha_re.match(test)) == apostrophe_tests[test]:
#         print(test)
# an iterator, via http://rare-technologies.com/word2vec-tutorial/
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
        # allow single apostrophes but not double apostrophes; note, this doesn't allow 'ere
        self.alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*?$")
        if stemming:
            self.stemmer = PorterStemmer()
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        # TODO: use http://www.nltk.org/howto/stem.html

    def __iter__(self):
        for line in open(self.filename):
            # TODO: find a better way to distinguish sentence-initial caps from proper nouns
            # sentences come like this:
            # 80    10:11 p.m., an unwanted person was reported on College Avenue.
            # 81    10:13 a.m., a report of shoplifting was investigated at Maine Smoke Shop on College Avenue.
            # 82    10:14: The proportion of A-levels awarded at least an A grade has fallen for the second year in a row.
            # 141529    But the debt ceiling may end up being the larger inflection point, especially as Obama staked out a hard-lined position against negotiating over that vote.
            sentence = line.split("\t", 1)[-1].replace(".", ' ')
            words = [word.lower() for word in self.treebank_word_tokenizer.tokenize(sentence)
                     if self.alpha_re.match(word.split("_")[0])]
            if stemming:
                yield [self.stemmer.stem(word) for word in words]
            else:
                yield words
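# for illustration: iterating yields one lowercased, filtered token list per input
# line; for the first sample line above that would be something like
#   for tokens in MySentences(smaller_sentences_filename):
#       print(tokens)  # => e.g. ['p', 'm', 'an', 'unwanted', 'person', 'was', 'reported', 'on', 'college', 'avenue']
#       break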
smaller_sentences = MySentences(smaller_sentences_filename) # a memory-friendly iterator
bigrams_threshold = 5 if pos_tagged else 15
trigrams_threshold = 5 if pos_tagged else 10 # new york times is 11.1
bigrams_max_vocab_size = 10 * 1000 * 1000
trigrams_max_vocab_size = 10 * 1000 * 1000
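# for reference, these thresholds apply to gensim's default phrase score,
#   score(a, b) = (count(a b) - min_count) * vocab_size / (count(a) * count(b)),
# which is why a trigram threshold of 10 keeps "new_york times" at 11.1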
try:
    bigrams_model_name = "bigrams_model_%(input_filename)s_%(threshold)i_%(max_vocab_size)i.bin" % {
        'input_filename': '.'.join(smaller_sentences_filename.split("/")[-1].split(".")[:-1]),
        'threshold': bigrams_threshold,
        'max_vocab_size': bigrams_max_vocab_size
    }
except Exception:
    bigrams_model_name = "bigrams_model.bin"
if exists(bigrams_model_name):
    bigrams_model = gensim.models.phrases.Phraser.load(bigrams_model_name)
else:
    bigrams_model_phrases = gensim.models.Phrases(smaller_sentences, threshold=bigrams_threshold, max_vocab_size=bigrams_max_vocab_size)
    bigrams_model_phrases.save(bigrams_model_name)  # note: this saves the full Phrases model, which is what .load() unpickles on later runs
    bigrams_model = gensim.models.phrases.Phraser(bigrams_model_phrases)
we_should_save_the_bigrams = False
if we_should_save_the_bigrams:  # NB: needs bigrams_model_phrases, so this only works on a fresh training run, not after loading from disk
    bigrams_so_far = set()
    with open(bigrams_model_name + ".txt", 'a') as f:
        for phrase, score in bigrams_model_phrases.export_phrases(smaller_sentences):
            if phrase == "new york":
                print("{}\t{}\n".format(phrase, score))
            if phrase not in bigrams_so_far:
                f.write("{}\t{}\n".format(phrase, score))
                bigrams_so_far.add(phrase)
smaller_sentences = MySentences(smaller_sentences_filename)  # re-created for clarity, though MySentences reopens its file on every __iter__ anyway
try:
    trigrams_model_name = "trigrams_model_%(input_filename)s_%(threshold)i_%(max_vocab_size)i.bin" % {
        'input_filename': '.'.join(smaller_sentences_filename.split("/")[-1].split(".")[:-1]),
        'threshold': trigrams_threshold,
        'max_vocab_size': trigrams_max_vocab_size
    }
except Exception:
    trigrams_model_name = "trigrams_model.bin"
if exists(trigrams_model_name):
    trigrams_model = gensim.models.phrases.Phraser.load(trigrams_model_name)
else:
    # train on bigram-joined sentences so that e.g. "new_york" + "times" can merge into "new_york_times"
    trigrams_model_phrases = gensim.models.Phrases(bigrams_model[smaller_sentences], threshold=trigrams_threshold, max_vocab_size=trigrams_max_vocab_size)
    trigrams_model_phrases.save(trigrams_model_name)
    trigrams_model = gensim.models.phrases.Phraser(trigrams_model_phrases)
we_should_save_the_trigrams = False
if we_should_save_the_trigrams:  # NB: needs trigrams_model_phrases, so this only works on a fresh training run
    trigrams_so_far = set()
    with open(trigrams_model_name + ".txt", 'a') as f:
        for phrase, score in trigrams_model_phrases.export_phrases(bigrams_model[smaller_sentences]):
            if phrase not in trigrams_so_far:
                f.write("{}\t{}\n".format(phrase, score))
                trigrams_so_far.add(phrase)
# sentences_with_phrases = [
# "Officials from Libya's moderate governing coalition were demanding that the United States stop the wealthy nation of Qatar from sending money and arms to militias aligned with Libya's Islamist political bloc.",
# "The Islamists, in turn, were accusing a rival gulf power, the United Arab Emirates, of providing similar patronage to fighters aligned with their political enemies.",
# "The appeal from Mitt Romney and the furious reaction to it captured the essence of the party's schism over Donald J. Trump."
# ]
# alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*$")
# my_treebank_word_tokenizer = TreebankWordTokenizer()
# tokenized_sentences = []
# for line in sentences_with_phrases:
# sentence = line.decode("UTF8").split("\t", 1)[-1]
# words = [word.lower() for word in my_treebank_word_tokenizer.tokenize(sentence) if re.match(alpha_re, word) ]
# tokenized_sentences.append(words)
# # for sent in tokenized_sentences:
# # print(bigrams_model[sent])
# # trigram = gensim.models.Phrases(bigrams_model[sentences])
# for sent in tokenized_sentences:
# print(trigrams_model[sent])
# raise Exception
# an iterator, via http://rare-technologies.com/word2vec-tutorial/
class SentencesToNgrammify(object):
    def __init__(self, filename, ngramlambda=None):
        self.filename = filename
        # allow single apostrophes but not double apostrophes; note, this doesn't allow 'ere
        self.alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*?$")
        if stemming:
            self.stemmer = PorterStemmer()
        # default to a pass-through so iterating without an ngramlambda doesn't crash
        self.ngramlambda = ngramlambda if ngramlambda is not None else (lambda x: x)
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        # TODO: use http://www.nltk.org/howto/stem.html

    def __iter__(self):
        for line in open(self.filename):
            # TODO: find a better way to distinguish sentence-initial caps from proper nouns
            # sentences come in the same tab-prefixed format shown in MySentences above
            sentence = line.split("\t", 1)[-1].replace(".", ' ')
            words = [word.lower() for word in self.treebank_word_tokenizer.tokenize(sentence)
                     if self.alpha_re.match(word.split("_")[0])]
            if stemming:
                yield self.ngramlambda([self.stemmer.stem(word) for word in words])
            else:
                yield self.ngramlambda(words)
class NgrammedSentences(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, 'r'):
            yield line.strip().split(" ")
ngrams_models = {
    "bigrams": lambda x: bigrams_model[x],
    "trigrams": lambda x: trigrams_model[bigrams_model[x]],
}
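# for illustration (hypothetical tokens): ngrams_models["trigrams"](["the", "new", "york", "times", "reported"])
# should come back with learned phrases joined, e.g. ['the', 'new_york_times', 'reported']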
ngrams_model = "trigrams"
min_count = 10 # was 10, also 50
size = 200
downsampling = 1e-3 # has variously been 1e-3 and 0
use_skipgrams = False
sentences = None
ngrammed_sentences_filename = "ngrammed_sentences_%s_%s_%s.txt" % ('.'.join(sentences_filename.split("/")[-1].split(".")[:-1]), "stemmed" if stemming else "raw_words", ngrams_model)
if exists(ngrammed_sentences_filename):
    print("loading sentences from pre-phrasified file: {}".format(ngrammed_sentences_filename))
    sentences = NgrammedSentences(ngrammed_sentences_filename)
else:
    sentences = SentencesToNgrammify(sentences_filename, ngrams_models[ngrams_model])
    print("writing phrasified sentences to file: {}".format(ngrammed_sentences_filename))
    with open(ngrammed_sentences_filename, 'a') as f:
        for s in sentences:
            f.write(' '.join(s) + "\n")
    sentences = NgrammedSentences(ngrammed_sentences_filename)
if True:  # left as a toggle: set to False to skip retraining when reusing a saved model
    model = gensim.models.Word2Vec(
        sentences,
        workers=4,
        min_count=min_count,
        size=size,  # NB: gensim 4.x renames this parameter to vector_size
        sample=downsampling,
        # sg=(1 if use_skipgrams else 0)
    )
    # model.init_sims(replace=True)
    try:
        model_name = "model_%s_%s_%s_min_count_%s_size_%s_downsampling_%s_%s.bin" % ('.'.join(sentences_filename.split("/")[-1].split(".")[:-1]), "stemmed" if stemming else "raw_words", ngrams_model, min_count, size, downsampling, "sg" if use_skipgrams else "cbow")
    except Exception:
        model_name = "model.bin"
    model.save(model_name)
    print(model.most_similar(positive=["pizza", "atlanta"], negative=["brooklyn"], topn=20))
    with open("most_recent_model_filename.txt", "w") as f:
        f.write(model_name)
    # (in gensim 4.x, most_similar/most_similar_cosmul live on model.wv instead)
    print(model.most_similar_cosmul(positive=['woman', 'king'], negative=['man']))
    print("finish training w2v " + str(datetime.now()))
    elapsed = (datetime.now() - start).total_seconds()
    print("training w2v took {} seconds ({} minutes)".format(elapsed, elapsed / 60))
# TODO: test more via https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
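# to poke at the trained model later, here's what to c/p into python (a sketch;
# the filename is whatever this run wrote to most_recent_model_filename.txt):
# import gensim
# model = gensim.models.Word2Vec.load(open("most_recent_model_filename.txt").read().strip())
# print(model.most_similar("new_york_times", topn=10))  # assuming the phrase made it into the vocabulary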