# -*- coding: utf-8 -*-
"""NLP_HW1 (1).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pkvdZKeCVAuCEeSNkzFnVkFF6ct18Arv
"""

from __future__ import division
import argparse
import pandas as pd

# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize

import re
import pickle

def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    with open(path) as f:
        for l in f:
            # sentences.append(l.lower().split())
            # preprocessing:
            # remove parenthesized text
            clean_text = re.sub(r"\([^)]*\)", "", l)
            # tokenize
            for paragraph in clean_text.split('\n'):
                if paragraph:
                    for sentence in paragraph.split('.'):
                        if sentence:
                            clean_sent = re.sub(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+\ *", " ", sentence)
                            tokens = clean_sent.lower().split()
                            sentences.append(tokens)

    return sentences
    # so text2sentences returns a list of lists of words

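# Example (hypothetical input): a corpus line such as
#   "The cat (a tabby) sat. Dogs bark!"
# becomes the token lists [['the', 'cat', 'sat'], ['dogs', 'bark']].
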
def loadPairs(path):
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'], data['word2'], data['similarity'])
    return pairs

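# Note: loadPairs expects a tab-separated file with columns 'word1', 'word2'
# and 'similarity'. In Python 3, zip() returns a one-shot iterator, so the
# returned pairs can only be traversed once.
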
class SkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=5, minCount=5):
        self.w2id = None  # word to ID mapping
        self.word_freq = {}
        self.idx_prob = None
        self.trainset = sentences  # list of sentences (each a list of words)
        # self.vocab = list(set(sentences))  # list of valid words
        self.nEmbed = nEmbed
        self.winSize = winSize
        self.minCount = max(minCount, 1)
        self.negativeRate = negativeRate
        # self.epochs = epochs
        self.Ei = None  # input (target) embeddings
        self.Eo = None  # output (context) embeddings
        # (not entirely sure about this part)
        self.trainWords = 1  # start at 1 to avoid dividing by zero
        self.accLoss = 0
        self.loss = []

    def build_w2id(self):
        """Build the word-to-index dictionary and the word-frequency dictionary."""
        # Iterate over each sentence in the training set
        for sent in self.trainset:
            # Iterate over each word in a sentence
            for word in sent:
                # count the frequency of each word
                self.word_freq[word] = self.word_freq.get(word, 0) + 1

        # Remove words whose frequency is below minCount
        self.word_freq = {word: freq for word, freq in self.word_freq.items() if freq >= self.minCount}

        # Create the w2id dictionary with words as keys and indices as values
        self.w2id = {w: idx for (idx, w) in enumerate(self.word_freq.keys())}

    def comput_prob(self, power=0.75):
        """Compute the negative-sampling probability distribution over words."""
        idx_prob = np.zeros(len(self.w2id))
        for word, frequency in self.word_freq.items():
            # Raising the frequency to the 3/4 power (empirically works best)
            # interpolates between the empirical and the uniform distribution
            # when sampling negative words.
            f = frequency ** power
            idx_prob[self.w2id[word]] = f
        self.idx_prob = idx_prob / np.sum(idx_prob)

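    # Worked example for the 0.75 exponent above: two words with counts 100 and 1
    # get unnormalized weights 100**0.75 ≈ 31.6 and 1, so after normalization the
    # frequent word is sampled roughly 31.6x (not 100x) as often as the rare one.
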
    def sample(self, omit):
        """Draw negativeRate negative word indices (with replacement) from
        idx_prob, excluding the indices in omit (the target and context words)."""
        omit_indices = list(omit)
        probabilities = np.copy(self.idx_prob)
        probabilities[omit_indices] = 0
        probabilities /= np.sum(probabilities)
        negative_samples = np.random.choice(len(self.idx_prob), size=self.negativeRate, p=probabilities)

        return negative_samples

    # Note: no epochs are used here; the whole training set is traversed once.
    def train(self, step_size=0.001, save_model_path=None):
        self.build_w2id()
        self.comput_prob()

        V = len(self.w2id)
        Ei = np.random.rand(self.nEmbed, V)
        Eo = np.random.rand(V, self.nEmbed)
        loss_best = 1e100

        # To train for several epochs, wrap everything below in an outer loop
        # (and indent it by one level).

        # Iterate over each sentence in the training set
        for counter, sentence in enumerate(self.trainset):
            # keep only the words that exist in the w2id dictionary
            sentence = list(filter(lambda word: word in self.w2id, sentence))

            # Iterate over each word in a sentence
            for wpos, word in enumerate(sentence):
                wIdx = self.w2id[word]
                winsize = np.random.randint(self.winSize) + 1
                # start and end positions of the context window for this word
                start = max(0, wpos - winsize)
                end = min(wpos + winsize + 1, len(sentence))

                # Iterate over each context word of the current word
                for context_word in sentence[start:end]:
                    ctxtId = self.w2id[context_word]
                    if ctxtId == wIdx:
                        continue  # skip the word itself
                    negativeIds = self.sample({wIdx, ctxtId})
                    self.trainWord(wIdx, ctxtId, negativeIds, Ei, Eo, step_size=step_size)
                    self.trainWords += 1

            # every 1000 sentences, record the average loss and keep the best weights
            if counter % 1000 == 0:
                # print(' > training %d of %d' % (counter, len(self.trainset)))

                self.loss.append(self.accLoss / self.trainWords)
                self.trainWords = 1  # reset to 1 to avoid dividing by zero
                self.accLoss = 0

                if self.loss[-1] < loss_best:
                    loss_best = self.loss[-1]
                    # snapshot the current weights as the best ones so far
                    self.Ei = Ei.copy()
                    self.Eo = Eo.copy()
                    if save_model_path is not None:
                        self.save(save_model_path)

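    # trainWord performs one stochastic gradient step on the skip-gram
    # negative-sampling loss for a single (target, context) pair with K
    # sampled negatives n_1..n_K:
    #
    #   L = -log sigmoid(eo . ei) - sum_k log sigmoid(-en_k . ei)
    #
    # where ei is the input (target) embedding, eo the output (context)
    # embedding and en_k are the output embeddings of the negatives. The
    # gradients used below follow directly:
    #   dL/dei   = -sigmoid(-eo . ei) * eo + sum_k sigmoid(en_k . ei) * en_k
    #   dL/deo   = -sigmoid(-eo . ei) * ei
    #   dL/den_k =  sigmoid(en_k . ei) * ei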
    def trainWord(self, wordId, contextId, negativeIds, Ei, Eo, step_size=0.001):

        ei = Ei[:, wordId]
        eo = Eo[contextId, :]
        en = Eo[negativeIds, :]

        # intermediate values that are helpful
        cos_p = expit(-np.dot(eo, ei))
        cos_n = expit(np.dot(en, ei))

        # Partial derivatives of the loss w.r.t. ei, eo and the negative embeddings
        dei = -cos_p * eo + np.dot(cos_n, en)
        deo = -cos_p * ei
        den = np.outer(cos_n, ei)

        # Gradient descent update
        ei -= step_size * dei
        eo -= step_size * deo
        en -= step_size * den

        Ei[:, wordId] = ei
        Eo[contextId, :] = eo
        Eo[negativeIds, :] = en

        # accumulate the loss for monitoring
        loss_word = -np.log(expit(np.dot(eo, ei))) + np.sum(-np.log(expit(-np.dot(en, ei))))
        self.accLoss += loss_word

    def save(self, path):
        data = {'w2id': self.w2id,
                'Ei': self.Ei,
                'Eo': self.Eo,
                'negativeRate': self.negativeRate,
                'nEmbed': self.nEmbed,
                'winSize': self.winSize,
                'minCount': self.minCount}
        with open(path, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    def similarity(self, word1, word2):
        """
        Computes the similarity between two words. Unknown words are mapped
        to the vector at index 0.
        :param word1:
        :param word2:
        :return: a float (cosine similarity; the higher, the more similar)
        """
        idx1 = self.w2id.get(word1, 0)
        idx2 = self.w2id.get(word2, 0)

        # Get the learned embedding vectors (input embedding for word1,
        # output embedding for word2)
        w1 = self.Ei[:, idx1]
        w2 = self.Eo[idx2, :]

        # Compute the cosine similarity score
        norm1 = np.linalg.norm(w1)
        norm2 = np.linalg.norm(w2)
        score = np.dot(w1, w2) / (norm1 * norm2)

        return score

    @staticmethod
    def load(path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        sg = SkipGram(sentences=None,
                      nEmbed=data['nEmbed'],
                      negativeRate=data['negativeRate'],
                      winSize=data['winSize'],
                      minCount=data['minCount'])
        sg.Ei = data['Ei']
        sg.Eo = data['Eo']
        sg.w2id = data['w2id']
        return sg

# Example of training interactively (commented out):
# sentences = text2sentences('NLP_1.txt')
# sg = SkipGram(sentences)
# sg.train(save_model_path="good_result", step_size=0.001)

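# Hypothetical follow-up (the word pair is illustrative; the score depends
# entirely on the training data):
#   sg = SkipGram.load("good_result")
#   print(sg.similarity("cat", "dog"))
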
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--text', help='path containing training data', required=True)
    parser.add_argument('--model', help='path to store/read model (when training/testing)', required=True)
    parser.add_argument('--test', help='enters test mode', action='store_true')

    opts = parser.parse_args()

    if not opts.test:
        sentences = text2sentences(opts.text)
        sg = SkipGram(sentences)
        sg.train(save_model_path=opts.model)
        sg.save(opts.model)

    else:
        pairs = loadPairs(opts.text)

        sg = SkipGram.load(opts.model)
        for a, b, _ in pairs:
            # make sure this does not raise any exception, even if a or b are not in sg.vocab
            print(sg.similarity(a, b))
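
# Example invocations (script and file names are illustrative):
#   python skipGram.py --text corpus.txt --model mymodel.bin            # train and save
#   python skipGram.py --text wordpairs.tsv --model mymodel.bin --test  # score word pairs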