
Commit

Add files via upload
skip-gram in word2vec
CruiseQiaoQiao authored Apr 19, 2021
1 parent 4a8b0b7 commit d67c966
Showing 1 changed file with 263 additions and 0 deletions: skipGram_new.py
# -*- coding: utf-8 -*-
"""NLP_HW1 (1).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1pkvdZKeCVAuCEeSNkzFnVkFF6ct18Arv
"""

from __future__ import division
import argparse
import pandas as pd

# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize

import re
import pickle

def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    with open(path) as f:
        for l in f:
            # sentences.append( l.lower().split() )
            # preprocessing:
            ## remove parenthesised text
            clean_text = re.sub(r"\([^)]*\)", "", l)
            ## tokenize
            for paragraph in clean_text.split('\n'):
                if paragraph:
                    for sentence in paragraph.split('.'):
                        if sentence:
                            clean_sent = re.sub(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+\ *", " ", sentence)
                            tokens = clean_sent.lower().split()
                            sentences.append(tokens)

    return sentences
# So text2sentences returns a list of lists of words (one list per sentence).
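# Illustrative example (not taken from the actual data): the line
#   "Mr. Smith (the elder) arrived. He sat down."
# has its parenthesised part removed and is then split on periods, giving
#   [['mr'], ['smith', 'arrived'], ['he', 'sat', 'down']]
# (abbreviation periods such as "Mr." therefore start a new "sentence").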

def loadPairs(path):
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'], data['word2'], data['similarity'])
    return pairs
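# The test file is expected to be tab-separated with a header row containing
# 'word1', 'word2' and 'similarity' columns, e.g. (values illustrative):
#   word1   word2   similarity
#   old     new     1.58
# loadPairs returns an iterator of (word1, word2, similarity) triples.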

class SkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=5, minCount=5):
        self.w2id = None            # word to ID mapping
        self.word_freq = {}         # word -> raw frequency count
        self.idx_prob = None        # negative-sampling distribution over word IDs
        self.trainset = sentences   # list of tokenized sentences
        # self.vocab = list(set(sentences))  # list of valid words
        self.nEmbed = nEmbed
        self.winSize = winSize
        self.minCount = max(minCount, 1)
        self.negativeRate = negativeRate
        # self.epochs = epochs
        self.Ei = None              # input embeddings (nEmbed x V)
        self.Eo = None              # output embeddings (V x nEmbed)
        # the settings below are tentative
        self.trainWords = 1         # start at 1 (not 0) to avoid dividing by zero when averaging the loss
        self.accLoss = 0
        self.loss = []



    def build_w2id(self):
        """Build the word_to_index dictionary and the word_freq dictionary."""
        # Iterate over each sentence in the training set
        for sent in self.trainset:
            # Iterate over each word in the sentence
            for word in sent:
                # count the frequency of each word
                self.word_freq[word] = self.word_freq.get(word, 0) + 1

        # Remove words whose frequency is less than minCount
        self.word_freq = {word: freq for word, freq in self.word_freq.items() if freq >= self.minCount}

        # Create the w2id dictionary with word as key, index as value
        self.w2id = {w: idx for (idx, w) in enumerate(self.word_freq.keys())}

    def compute_prob(self, power=0.75):
        """Compute the negative-sampling probability distribution over words."""
        idx_prob = np.zeros(len(self.w2id))
        for word, frequency in self.word_freq.items():
            # Raising the frequency to the power 3/4 (empirically works best) samples
            # negative words from a distribution between the empirical unigram
            # distribution and the uniform distribution.
            f = frequency ** power
            idx_prob[self.w2id[word]] = f
        self.idx_prob = idx_prob / np.sum(idx_prob)
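    # Worked example of the 3/4 smoothing: a word seen 10,000 times gets weight
    # 10000 ** 0.75 = 1000, while a word seen 16 times gets 16 ** 0.75 = 8, so the
    # ratio between the two shrinks from 625:1 to 125:1 before normalization.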

    def sample(self, omit):
        """Sample negativeRate negative word IDs, never returning the IDs in `omit`."""
        omit_indices = list(omit)
        probabilities = np.copy(self.idx_prob)
        probabilities[omit_indices] = 0
        probabilities /= np.sum(probabilities)
        negative_samples = np.random.choice(len(self.idx_prob), size=self.negativeRate, p=probabilities)

        return negative_samples
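    # Note: np.random.choice samples with replacement by default, so the same
    # negative ID can occur more than once for a given pair; a duplicate simply
    # contributes twice to the update in trainWord.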

    # Note: epochs are not handled here; the corpus is trained in a single pass.
    def train(self, step_size=0.001, save_model_path=None):
        # self.compute_word2idx_and_unigram()
        self.build_w2id()
        self.compute_prob()

        V = len(self.w2id)
        Ei = np.random.rand(self.nEmbed, V)
        Eo = np.random.rand(V, self.nEmbed)
        loss_best = 1e100

        # To train for several epochs, wrap everything below in an outer loop
        # (and indent it by one level).

        # Iterate over each sentence in the training set
        for counter, sentence in enumerate(self.trainset):
            # keep only the words that exist in the w2id dictionary
            sentence = list(filter(lambda word: word in self.w2id.keys(), sentence))

            # Iterate over each word in the sentence
            for wpos, word in enumerate(sentence):
                wIdx = self.w2id[word]
                winsize = np.random.randint(self.winSize) + 1
                # set the start and end positions of the window around the word
                start = max(0, wpos - winsize)
                end = min(wpos + winsize + 1, len(sentence))

                # Iterate over each context word of the current word
                for context_word in sentence[start:end]:
                    ctxtId = self.w2id[context_word]
                    if ctxtId == wIdx: continue  # skip the word itself
                    negativeIds = self.sample({wIdx, ctxtId})
                    self.trainWord(wIdx, ctxtId, negativeIds, Ei, Eo, step_size=step_size)
                    self.trainWords += 1

            # for every 1000 sentences trained, record the average loss and
            # checkpoint the embeddings when the loss improves
            if counter % 1000 == 0 and counter > 0:
                # print(' > training %d of %d' % (counter, len(self.trainset)))

                self.loss.append(self.accLoss / self.trainWords)
                self.trainWords = 1  # reset to 1 rather than 0 to avoid division by zero
                self.accLoss = 0

                if self.loss[-1] < loss_best:
                    loss_best = self.loss[-1]
                    self.Ei = Ei
                    self.Eo = Eo
                    if save_model_path is not None:
                        self.save(save_model_path)

        # final checkpoint so that the embeddings are stored at least once,
        # even for corpora shorter than 1000 sentences
        self.loss.append(self.accLoss / self.trainWords)
        if self.loss[-1] < loss_best or self.Ei is None:
            self.Ei = Ei
            self.Eo = Eo
            if save_model_path is not None:
                self.save(save_model_path)

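    # trainWord performs one stochastic gradient step of skip-gram with negative
    # sampling: for input vector ei (column of Ei), context vector eo (row of Eo)
    # and negative vectors en (rows of Eo), the per-pair loss is
    #   L = -log sigmoid(eo . ei) - sum_n log sigmoid(-en . ei)
    # and dei, deo, den below are its gradients with respect to ei, eo and en.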
    def trainWord(self, wordId, contextId, negativeIds, Ei, Eo, step_size=0.001):

        ei = Ei[:, wordId]       # input embedding of the centre word
        eo = Eo[contextId, :]    # output embedding of the context word
        en = Eo[negativeIds, :]  # output embeddings of the negative samples

        # intermediate sigmoid terms used by the gradients
        sig_p = expit(-np.dot(eo, ei))   # 1 - sigmoid(eo . ei)
        sig_n = expit(np.dot(en, ei))    # sigmoid(en . ei), one value per negative sample

        # Gradients of the per-pair loss
        dei = -sig_p * eo + np.dot(sig_n, en)
        deo = -sig_p * ei
        den = np.outer(sig_n, ei)

        # Gradient descent update
        ei -= step_size * dei
        eo -= step_size * deo
        en -= step_size * den

        Ei[:, wordId] = ei
        Eo[contextId, :] = eo
        Eo[negativeIds, :] = en

        # accumulate the loss of this (word, context) pair
        loss_word = -np.log(expit(np.dot(eo, ei))) + np.sum(-np.log(expit(-np.dot(en, ei))))
        self.accLoss += loss_word

    def save(self, path):
        data = {'w2id': self.w2id,
                'Ei': self.Ei,
                'Eo': self.Eo,
                'negativeRate': self.negativeRate,
                'nEmbed': self.nEmbed,
                'winSize': self.winSize,
                'minCount': self.minCount}
        with open(path, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    def similarity(self, word1, word2):
        """
        Computes the similarity between the two words. Unknown words are mapped to the vector at index 0.
        :param word1:
        :param word2:
        :return: a float in [-1, 1] indicating the similarity (the higher the more similar)
        """
        idx1 = self.w2id.get(word1, 0)
        idx2 = self.w2id.get(word2, 0)

        # Get the learned embedding vectors
        w1 = self.Ei[:, idx1]
        w2 = self.Eo[idx2, :]

        # Compute the cosine similarity score
        norm1 = np.linalg.norm(w1)
        norm2 = np.linalg.norm(w2)
        score = np.dot(w1, w2) / (norm1 * norm2)

        return score
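    # Note: this compares the *input* embedding of word1 (a column of Ei) with the
    # *output* embedding of word2 (a row of Eo), so the score is not symmetric in
    # general: similarity(a, b) and similarity(b, a) may differ.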


    @staticmethod
    def load(path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        sg = SkipGram(sentences=None,
                      nEmbed=data['nEmbed'],
                      negativeRate=data['negativeRate'],
                      winSize=data['winSize'],
                      minCount=data['minCount'])
        sg.Ei = data['Ei']
        sg.Eo = data['Eo']
        sg.w2id = data['w2id']
        return sg

# sentences = text2sentences('NLP_1.txt')
# sg = SkipGram(sentences)
# sg.train(save_model_path="good_result", step_size=0.001)



if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--text', help='path containing training data', required=True)
    parser.add_argument('--model', help='path to store/read model (when training/testing)', required=True)
    parser.add_argument('--test', help='enters test mode', action='store_true')

    opts = parser.parse_args()

    if not opts.test:
        sentences = text2sentences(opts.text)
        sg = SkipGram(sentences)
        sg.train(save_model_path=opts.model)
        sg.save(opts.model)

    else:
        pairs = loadPairs(opts.text)

        sg = SkipGram.load(opts.model)
        for a, b, _ in pairs:
            # make sure this does not raise any exception, even if a or b are not in sg.vocab
            print(sg.similarity(a, b))
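# Example invocations (file names below are placeholders, not files from this repo):
#   python skipGram_new.py --text corpus.txt --model mymodel.pkl          # train and save a model
#   python skipGram_new.py --text simlex.csv --model mymodel.pkl --test   # print a similarity per word pair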
