capstoneScraper.py
# -*- coding: utf-8 -*-
import requests
from lxml import html
import numpy as np
import pandas as pd
import json
#import matplotlib.pyplot as plt
#import plotly.tools as plTools
#import plotly.plotly as py
#import plotly.graph_objs as go
#generates a dictionary of target words and weights
#adds it to linkDict with the key being sourceWord
def genTargetDict(sourceWord, linkDict, lang):
    #lexicon URLs: sURL is used when the source word is English, tURL (l=fre) when it is French
    sURL = 'http://opus.lingfil.uu.se/lex.php'
    tURL = 'http://opus.lingfil.uu.se/lex.php?l=fre'
    #Using limited search for demo purposes (POST does not include all=1)
    #Given the results of the demo and other tests, limited search is recommended in all cases
    sPayload = {'fre': 'on', 'w': sourceWord, 'submit': 'select', 'c': 'all'}
    tPayload = {'eng': 'on', 'w': sourceWord, 'submit': 'select', 'c': 'all'}
    if lang == 'eng':
        URL = sURL
        payload = sPayload
    else:
        URL = tURL
        payload = tPayload
    page = requests.post(URL, data=payload)
    #parse the results page to get candidate words and their corpus frequencies
    tree = html.fromstring(page.content)
    rawFreq = tree.xpath('//td[@class="freq"]/a/text()')
    words = tree.xpath('//td[@class="trg"]/a/text()')
    intFreq = [int(str(i)) for i in rawFreq]
    #aggregate frequencies per lower-cased candidate so case variants are counted together
    candidateDict = {}
    for i in range(len(words)):
        key = str(words[i]).lower()
        if key not in candidateDict:
            candidateDict[key] = 0
        candidateDict[key] += intFreq[i]
    #normalize so the weights for this source word sum to 1
    total = sum(candidateDict.values())
    linkDict[sourceWord] = {word: freq / total for word, freq in candidateDict.items()}
    #remove all words that already exist in the graph
    for word in linkDict.keys():
        linkDict[sourceWord].pop(word, None)
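
#A minimal usage sketch (not part of the original pipeline; the word 'honey' and the
#names below are illustrative only). genTargetDict mutates the passed-in dict: after the
#call, links['honey'] maps lower-cased French candidates to normalized corpus frequencies.
def _demoGenTargetDict():
    links = {}
    genTargetDict('honey', links, 'eng')
    return links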

#adds to wordDict the weights of two-hop translations: for each direct translation of
#sourceWord that is itself in the lattice, spread its weight onto that word's translations
def sameLangTranslations(sourceWord, lattice, wordDict):
    for mirroredWord in lattice[sourceWord]:
        weight = lattice[sourceWord].get(mirroredWord, 0)
        if weight != 0 and mirroredWord in lattice:
            for synonym in lattice[mirroredWord]:
                wordDict[synonym] += weight * lattice[mirroredWord][synonym]
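
#A toy worked example (hypothetical data, not produced by the scraper) of how
#sameLangTranslations spreads weight two hops through the lattice: 'honey' -> 'miel'
#-> {'sweetie', 'darling'}, so wordDict picks up 0.6 and 0.4 for the two synonyms.
def _demoSameLangTranslations():
    lattice = {'honey': {'miel': 1.0},
               'miel': {'sweetie': 0.6, 'darling': 0.4}}
    wordDict = {'sweetie': 0, 'darling': 0, 'miel': 0, 'honey': 0}
    sameLangTranslations('honey', lattice, wordDict)
    return wordDict  #{'sweetie': 0.6, 'darling': 0.4, 'miel': 0, 'honey': 0}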

#builds, for every word in allWords, a dictionary of weighted connections to every other
#word, starting from the direct translation weights in wordLinks and then propagating
#weight through intermediate words
def translationPairsToLattice(allWords, wordLinks, sourceIterations):
    wordDicts = {}
    for eachWord in allWords:
        wordDicts[eachWord] = {}
        #add all the words to each dictionary
        for everyWord in allWords:
            wordDicts[eachWord][everyWord] = 0
        #add in the weights of the first translations to each word
        if eachWord in wordLinks:
            for eachKey in wordLinks[eachWord]:
                wordDicts[eachWord][eachKey] = wordLinks[eachWord][eachKey]
    #Starting from the bottom of the tree, connect every word to its translations,
    #then connect the word to its children's translations, weighted by its connection to its children
    for eachIter in sourceIterations:
        for eachWord in eachIter:
            sameLangTranslations(eachWord, wordLinks, wordDicts[eachWord])
            for everyWord in wordDicts[eachWord]:
                if everyWord not in wordLinks[eachWord]:
                    weight = wordDicts[eachWord].get(everyWord, 0)
                    if weight != 0 and everyWord in wordDicts and eachWord != everyWord:
                        for eachConnection in wordDicts[everyWord]:
                            #check that the key exists in both dictionaries, and skip if not
                            if eachConnection in wordDicts[eachWord] and eachConnection in wordDicts[everyWord]:
                                wordDicts[eachWord][eachConnection] += weight * wordDicts[everyWord][eachConnection]
    return wordDicts
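
#A self-contained sketch (hypothetical wordLinks/sourceIterations, not scraped) of how
#translationPairsToLattice is driven: every word in a source-language layer must have a
#wordLinks entry, layers are listed deepest-first, and the result connects 'honey' to
#'cherie' through the chain honey -> miel -> sweetie/darling -> cherie.
def _demoTranslationLattice():
    wordLinks = {'honey': {'miel': 1.0},
                 'miel': {'sweetie': 0.6, 'darling': 0.4},
                 'sweetie': {'cherie': 1.0},
                 'darling': {'cherie': 1.0}}
    allWords = ['cherie', 'sweetie', 'darling', 'miel', 'honey']
    sourceIterations = [['sweetie', 'darling'], ['honey']]
    return translationPairsToLattice(allWords, wordLinks, sourceIterations)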

def constructMasterWordList(targetIterations, sourceIterations):
    #Create a master list of all words in the order that they will have in the tensorFlow input
    allWords = []
    for wordIter in reversed(sourceIterations):
        for word in wordIter:
            if word not in allWords:
                allWords.append(word)
    for wordIter in reversed(targetIterations):
        for word in wordIter:
            if word not in allWords:
                allWords.append(word)
    return allWords

def genDataframe(sourceIterations, wordDicts, allWords):
    #convert the per-word dictionaries into a single data table, one row per source-language word
    dataTable = []
    rowNames = []
    for eachIter in reversed(sourceIterations):
        for eachWord in eachIter:
            rowNames.append(eachWord)
            newRow = []
            for everyWord in allWords:
                newRow.append(wordDicts[eachWord][everyWord])
            dataTable.append(newRow)
    wordData = pd.DataFrame(dataTable)
    #rows were built in allWords order, so label the columns with allWords in that same order
    wordData.columns = allWords
    wordData.index = rowNames
    return wordData

def PCA(wordData, numCols, sourceWord):
    #reduce the dimensionality of the dataset to n x numCols
    #this is to ensure consistent column meanings for the neural net
    #Credit to Sebastian Raschka's plotly tutorial for significant contributions to the PCA code section
    #Original tutorial can be found at https://plot.ly/ipython-notebooks/principal-component-analysis/#PCA-Vs.-LDA
    covariance = wordData.cov()
    eigenVals, eigenVects = np.linalg.eigh(covariance.values.real)
    eigenPairs = [(np.abs(eigenVals[i]), eigenVects[:, i]) for i in range(len(eigenVals))]
    #sort the eigenvectors by descending eigenvalue and keep the top numCols of them
    eigenPairs = sorted(eigenPairs, key=lambda pair: pair[0], reverse=True)
    transformation = np.array([pair[1] for pair in eigenPairs[0:numCols]])
    transformation = np.transpose(transformation)
    preparedData = wordData.dot(transformation)
    #Save metrics on explained variance as a sorted list of pairs (explained variance of each vector, total explained)
    totalVariance = sum(eigenVals)
    varianceExplained = [(i / totalVariance) * 100 for i in sorted(eigenVals, reverse=True)]
    cumulativeVarianceExplained = np.cumsum(varianceExplained)
    pd.DataFrame([varianceExplained, cumulativeVarianceExplained.tolist()]).to_csv(sourceWord + 'PCA_dat.csv')
    return preparedData
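
#An equivalent sketch using scikit-learn (an assumption; sklearn is not a dependency of
#this script). Its output may differ from PCA() above by column signs and a constant row
#offset, because sklearn centers the data before projecting onto the eigenvectors.
def _sklearnPCA(wordData, numCols):
    from sklearn.decomposition import PCA as SkPCA
    reducer = SkPCA(n_components=numCols)
    reduced = reducer.fit_transform(wordData.values)
    return pd.DataFrame(reduced, index=wordData.index)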

def generateWordData(sourceWord, sLangLayers):
    #work with the lower-cased form throughout so dictionary keys stay consistent
    sourceWord = sourceWord.lower()
    lang = 'eng'
    #dictionary of dictionaries; connects words to their translations and lists weights
    wordLinks = {}
    #lists holding the words that were added at each iteration, grouped by language
    sourceIterations = []
    targetIterations = []
    #number of iterations needed to generate the desired depth, minus 1 because the initial word is a layer;
    #layers is then doubled because for each layer created in the source lang. a target lang. layer must also be created
    iterations = (sLangLayers - 1) * 2
    #Add the list of iteration 0 words to the source list
    sourceIterations.append([sourceWord])
    #generate the first set of translations
    genTargetDict(sourceWord, wordLinks, lang)
    #Mirror for the first time (target treated as source, source treated as target)
    nextSources = list(wordLinks[sourceWord].keys())
    #limit the mirroring to the user-defined number of iterations; each iteration processes the current frontier once
    for iterLimit in range(0, iterations):
        print('.')
        #toggle language selection with each iteration; lang is the current source language
        if lang == 'eng':
            #if the last source language was English, the nextSources will be French
            targetIterations.insert(0, list(set(nextSources)))
            lang = 'fre'
        else:
            #if the last source language was not English, the nextSources will be English
            sourceIterations.insert(0, list(set(nextSources)))
            lang = 'eng'
        #no candidate sources yet for this iteration
        candidateSources = []
        #for each source, find all of its translations and add them to candidateSources
        for source in nextSources:
            src = str(source)
            genTargetDict(src, wordLinks, lang)
            #add all keys (translations) to the candidate list
            for word in list(wordLinks[src]):
                candidateSources.append(word)
        #the candidates become the sources for the next iteration
        nextSources = candidateSources
    #record the last set of targets after the mirroring finishes
    targetIterations.insert(0, nextSources)
    allWords = constructMasterWordList(targetIterations, sourceIterations)
    #Build dictionaries for each word that extend past initial translations
    wordDicts = translationPairsToLattice(allWords, wordLinks, sourceIterations)
    #Make all the connections bi-directional (each child connects back to its parent)
    for words in reversed(sourceIterations):
        for parent in words:
            for child in wordDicts[parent]:
                #skip if the child has no entry of its own
                if child in wordDicts:
                    wordDicts[child][parent] = wordDicts[parent][child]
    with open("latticeOutput.json", "w") as output:
        json.dump(wordDicts, output)
    #normalize the dataset: compute each row total once so earlier updates do not skew later ones
    for eachDict in wordDicts.values():
        total = sum(eachDict.values())
        if total != 0:
            for eachWord in eachDict:
                eachDict[eachWord] = eachDict[eachWord] / total
    #make all words connect to themselves
    for eachWord in wordDicts:
        wordDicts[eachWord][eachWord] = 1
    wordData = genDataframe(sourceIterations, wordDicts, allWords)
    #print(wordDicts['honey'])
    finalCols = 16
    preparedData = PCA(wordData, finalCols, sourceWord)
    #with open('scrapeDict.csv', 'wb') as csv_file:
    #    writer = csv.DictWriter(csv_file, preparedData.keys())
    #    writer.writeheader()
    #    writer.writerow(preparedData)
    return preparedData
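
#A single-word usage sketch (not called anywhere; the parameters and output filename are
#illustrative). With two source-language layers the lattice holds the source word, its
#French translations, their English back-translations, and one further French layer
#recorded as targets.
def _demoGenerateWordData():
    frame = generateWordData('honey', 2)
    frame.to_csv('honey_demo_values.csv')
    return frame
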
#plTools.set_credentials_file(username='JackHouk', api_key='dEl1WMGPvkeClnayYxJz')

def main():
    #List of words used to train the neural net.
    sourceWords = ["honey", "child", "sweet", "dear", "baby", "run", "operate", "guide", "flee", "calculate",
                   "big", "important", "meaningful", "exciting", "major", "joy", "happiness", "smile", "fun", "cool",
                   "hot", "cold", "young", "old", "fish",
                   "spy", "clean", "destroy", "hide", "show"]
    for sourceWord in sourceWords:
        jsonItem = [sourceWord]
        print(sourceWord)
        with open("visualization/scraperOutput.json", "w") as output:
            json.dump(jsonItem, output)
        sLangLayers = 2
        arrayOfWords = generateWordData(sourceWord, sLangLayers)
        arrayOfWords.to_csv(sourceWord + "_values.csv")
    #np.save('training_honey', generateWordData())
    #np.save('test_wood', generateWordData())

if __name__ == '__main__':
    main()