# -*- coding: utf-8 -*-
"""NLP_HW1 (1).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pkvdZKeCVAuCEeSNkzFnVkFF6ct18Arv
"""

from __future__ import division
import argparse
import pandas as pd

# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize

import re
import pickle

def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    with open(path) as f:
        for l in f:
            # sentences.append(l.lower().split())
            # preprocessing:
            # remove parenthesized text
            clean_text = re.sub(r"\([^)]*\)", "", l)
            # tokenize
            for paragraph in clean_text.split('\n'):
                if paragraph:
                    for sentence in paragraph.split('.'):
                        if sentence:
                            clean_sent = re.sub(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+\ *", " ", sentence)
                            tokens = clean_sent.lower().split()
                            sentences.append(tokens)

    return sentences
    # so text2sentences returns a list of lists of words

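# Example (hypothetical input): a corpus line such as
#   "The cat (a tabby) sat. Dogs bark!"
# becomes the token lists [['the', 'cat', 'sat'], ['dogs', 'bark']].
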
def loadPairs(path):
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'], data['word2'], data['similarity'])
    return pairs

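# Note: loadPairs expects a tab-separated file with columns 'word1', 'word2'
# and 'similarity'. In Python 3, zip() returns a one-shot iterator, so the
# returned pairs can only be traversed once.
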
class SkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=5, minCount=5):
        self.w2id = None  # word to ID mapping
        self.word_freq = {}
        self.idx_prob = None
        self.trainset = sentences  # list of sentences (each a list of words)
        # self.vocab = list(set(sentences))  # list of valid words
        self.nEmbed = nEmbed
        self.winSize = winSize
        self.minCount = max(minCount, 1)
        self.negativeRate = negativeRate
        # self.epochs = epochs
        self.Ei = None  # input (target) embeddings
        self.Eo = None  # output (context) embeddings
        # (not entirely sure about this part)
        self.trainWords = 1  # start at 1 to avoid dividing by zero
        self.accLoss = 0
        self.loss = []

    def build_w2id(self):
        """Build the word-to-index dictionary and the word-frequency dictionary."""
        # Iterate over each sentence in the training set
        for sent in self.trainset:
            # Iterate over each word in a sentence
            for word in sent:
                # count the frequency of each word
                self.word_freq[word] = self.word_freq.get(word, 0) + 1

        # Remove words whose frequency is below minCount
        self.word_freq = {word: freq for word, freq in self.word_freq.items() if freq >= self.minCount}

        # Create the w2id dictionary with words as keys and indices as values
        self.w2id = {w: idx for (idx, w) in enumerate(self.word_freq.keys())}

    def comput_prob(self, power=0.75):
        """Compute the negative-sampling probability distribution over words."""
        idx_prob = np.zeros(len(self.w2id))
        for word, frequency in self.word_freq.items():
            # Raising the frequency to the 3/4 power (empirically works best)
            # interpolates between the empirical and the uniform distribution
            # when sampling negative words.
            f = frequency ** power
            idx_prob[self.w2id[word]] = f
        self.idx_prob = idx_prob / np.sum(idx_prob)

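    # Worked example for the 0.75 exponent above: two words with counts 100 and 1
    # get unnormalized weights 100**0.75 ≈ 31.6 and 1, so after normalization the
    # frequent word is sampled roughly 31.6x (not 100x) as often as the rare one.
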
    def sample(self, omit):
        """Draw negativeRate negative word indices (with replacement) from
        idx_prob, excluding the indices in omit (the target and context words)."""
        omit_indices = list(omit)
        probabilities = np.copy(self.idx_prob)
        probabilities[omit_indices] = 0
        probabilities /= np.sum(probabilities)
        negative_samples = np.random.choice(len(self.idx_prob), size=self.negativeRate, p=probabilities)

        return negative_samples

    # Note: no epochs are used here; the whole training set is traversed once.
    def train(self, step_size=0.001, save_model_path=None):
        self.build_w2id()
        self.comput_prob()

        V = len(self.w2id)
        Ei = np.random.rand(self.nEmbed, V)
        Eo = np.random.rand(V, self.nEmbed)
        loss_best = 1e100

        # To train for several epochs, wrap everything below in an outer loop
        # (and indent it by one level).

        # Iterate over each sentence in the training set
        for counter, sentence in enumerate(self.trainset):
            # keep only the words that exist in the w2id dictionary
            sentence = list(filter(lambda word: word in self.w2id, sentence))

            # Iterate over each word in a sentence
            for wpos, word in enumerate(sentence):
                wIdx = self.w2id[word]
                winsize = np.random.randint(self.winSize) + 1
                # start and end positions of the context window for this word
                start = max(0, wpos - winsize)
                end = min(wpos + winsize + 1, len(sentence))

                # Iterate over each context word of the current word
                for context_word in sentence[start:end]:
                    ctxtId = self.w2id[context_word]
                    if ctxtId == wIdx:
                        continue  # skip the word itself
                    negativeIds = self.sample({wIdx, ctxtId})
                    self.trainWord(wIdx, ctxtId, negativeIds, Ei, Eo, step_size=step_size)
                    self.trainWords += 1

            # every 1000 sentences, record the average loss and keep the best weights
            if counter % 1000 == 0:
                # print(' > training %d of %d' % (counter, len(self.trainset)))

                self.loss.append(self.accLoss / self.trainWords)
                self.trainWords = 1  # reset to 1 to avoid dividing by zero
                self.accLoss = 0

                if self.loss[-1] < loss_best:
                    loss_best = self.loss[-1]
                    # snapshot the current weights as the best ones so far
                    self.Ei = Ei.copy()
                    self.Eo = Eo.copy()
                    if save_model_path is not None:
                        self.save(save_model_path)

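    # trainWord performs one stochastic gradient step on the skip-gram
    # negative-sampling loss for a single (target, context) pair with K
    # sampled negatives n_1..n_K:
    #
    #   L = -log sigmoid(eo . ei) - sum_k log sigmoid(-en_k . ei)
    #
    # where ei is the input (target) embedding, eo the output (context)
    # embedding and en_k are the output embeddings of the negatives. The
    # gradients used below follow directly:
    #   dL/dei   = -sigmoid(-eo . ei) * eo + sum_k sigmoid(en_k . ei) * en_k
    #   dL/deo   = -sigmoid(-eo . ei) * ei
    #   dL/den_k =  sigmoid(en_k . ei) * ei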
    def trainWord(self, wordId, contextId, negativeIds, Ei, Eo, step_size=0.001):

        ei = Ei[:, wordId]
        eo = Eo[contextId, :]
        en = Eo[negativeIds, :]

        # intermediate values that are helpful
        cos_p = expit(-np.dot(eo, ei))
        cos_n = expit(np.dot(en, ei))

        # Partial derivatives of the loss w.r.t. ei, eo and the negative embeddings
        dei = -cos_p * eo + np.dot(cos_n, en)
        deo = -cos_p * ei
        den = np.outer(cos_n, ei)

        # Gradient descent update
        ei -= step_size * dei
        eo -= step_size * deo
        en -= step_size * den

        Ei[:, wordId] = ei
        Eo[contextId, :] = eo
        Eo[negativeIds, :] = en

        # accumulate the loss for monitoring
        loss_word = -np.log(expit(np.dot(eo, ei))) + np.sum(-np.log(expit(-np.dot(en, ei))))
        self.accLoss += loss_word

    def save(self, path):
        data = {'w2id': self.w2id,
                'Ei': self.Ei,
                'Eo': self.Eo,
                'negativeRate': self.negativeRate,
                'nEmbed': self.nEmbed,
                'winSize': self.winSize,
                'minCount': self.minCount}
        with open(path, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    def similarity(self, word1, word2):
        """
        Computes the similarity between two words. Unknown words are mapped
        to the vector at index 0.
        :param word1:
        :param word2:
        :return: a float (cosine similarity; the higher, the more similar)
        """
        idx1 = self.w2id.get(word1, 0)
        idx2 = self.w2id.get(word2, 0)

        # Get the learned embedding vectors (input embedding for word1,
        # output embedding for word2)
        w1 = self.Ei[:, idx1]
        w2 = self.Eo[idx2, :]

        # Compute the cosine similarity score
        norm1 = np.linalg.norm(w1)
        norm2 = np.linalg.norm(w2)
        score = np.dot(w1, w2) / (norm1 * norm2)

        return score

    @staticmethod
    def load(path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        sg = SkipGram(sentences=None,
                      nEmbed=data['nEmbed'],
                      negativeRate=data['negativeRate'],
                      winSize=data['winSize'],
                      minCount=data['minCount'])
        sg.Ei = data['Ei']
        sg.Eo = data['Eo']
        sg.w2id = data['w2id']
        return sg

# Example of training interactively (commented out):
# sentences = text2sentences('NLP_1.txt')
# sg = SkipGram(sentences)
# sg.train(save_model_path="good_result", step_size=0.001)

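# Hypothetical follow-up (the word pair is illustrative; the score depends
# entirely on the training data):
#   sg = SkipGram.load("good_result")
#   print(sg.similarity("cat", "dog"))
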
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--text', help='path containing training data', required=True)
    parser.add_argument('--model', help='path to store/read model (when training/testing)', required=True)
    parser.add_argument('--test', help='enters test mode', action='store_true')

    opts = parser.parse_args()

    if not opts.test:
        sentences = text2sentences(opts.text)
        sg = SkipGram(sentences)
        sg.train(save_model_path=opts.model)
        sg.save(opts.model)

    else:
        pairs = loadPairs(opts.text)

        sg = SkipGram.load(opts.model)
        for a, b, _ in pairs:
            # make sure this does not raise any exception, even if a or b are not in sg.vocab
            print(sg.similarity(a, b))
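
# Example invocations (script and file names are illustrative):
#   python skipGram.py --text corpus.txt --model mymodel.bin            # train and save
#   python skipGram.py --text wordpairs.tsv --model mymodel.bin --test  # score word pairs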