-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
267 lines (191 loc) · 7.52 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import re
import random
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import networkx as nx
import nltk
from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters
import stopwordsiso as stopwords
from string import punctuation
import matplotlib.pyplot as plt
# Shared pymorphy2 analyzer used for lemmatization and POS tagging.
morph = MorphAnalyzer()
# Converter from the 'opencorpora-int' tagset to the 'ud20' tagset.
to_ud = converters.converter('opencorpora-int', 'ud20')
# Russian stop words from stopwordsiso, extended below with extra entries.
stops = stopwords.stopwords("ru")
added_stops = {'весь', 'это', 'наш', 'оно', 'итак', 'т.п', 'т.е', 'мало', 'меньше', 'ещё', 'слишком', 'также',
'ваш', 'б', 'хм', 'который', 'свой', 'не', 'мочь', 'однако', 'очень', 'благодаря', 'кроме', 'вся',
'какие', 'ru', 'en', 'млрд', 'млн', 'нет', 'этот', 'мной', 'дело', 'был', 'долго', 'наша', 'самих',
'миллионов', 'самых', 'ост', 'ст', 'д', 'проспект', 'компания', 'компании', 'компанию', 'компанией',
'компаниям', 'e-mail', 'шаг', 'ул', 'rus', 'eng', 'проезд', 'площадь', 'cookies', 'куки', 'кг', 'xl',
'rss', 'amp', ';amp', 'pdf', 'doc', 'txt', 'docx', 'i', 'id',
'бывший'}
stops = stops.union(added_stops)
# ASCII punctuation plus extra typographic characters stripped from tokens.
punct = punctuation + '«»—…“”*№–'
# NOTE(review): hard-coded absolute path, and the word2vec model is loaded
# at import time -- importing this module fails if the file is missing.
model_path = '/home/mitya/PycharmProjects/nlp_graph/model.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True)
def pymorphy_tagger(text):
    """Lemmatize and POS-tag *text*, returning ``lemma_POS`` items.

    The text is tokenized with NLTK. Each token is stripped of surrounding
    punctuation and dropped when it is empty, a stop word, purely numeric
    (ignoring punctuation inside it) or the literal string ``'nan'``.
    Surviving tokens are lemmatized with the module-level pymorphy2
    analyzer and their POS tag is passed through the ``to_ud`` converter.

    Args:
        text: raw input string.

    Returns:
        One string of space-separated ``lemma_POS`` items; empty when no
        token survives the filtering.
    """
    text = text.replace('[', ' ').replace(']', ' ')
    # Escape the punctuation so that characters such as \ ] ^ - are taken
    # literally inside the character class, and build the pattern once per
    # call instead of once per token.
    punct_run = re.compile('[{}]+'.format(re.escape(punct)))
    parsed = []
    for token in nltk.word_tokenize(text):
        word = token.strip(punct)
        # After stripping, the only string still "contained" in punct is ''.
        if not word or word in stops or word == 'nan':
            continue
        if punct_run.sub('', word).isdigit():
            continue
        # Parse once and reuse the first analysis for both lemma and POS.
        best = morph.parse(word)[0]
        lemma = str(best.normal_form)
        pos = to_ud(str(best.tag.POS)).split()[0]
        parsed.append(lemma + '_' + pos)
    return ' '.join(parsed)
def cosine(a, b):
    """Return the cosine similarity between NumPy arrays *a* and *b*.

    Args:
        a: NumPy array.
        b: NumPy array; it is transposed before the dot product.

    Returns:
        ``dot(a, b.T) / (norm(a) * norm(b))``. When either norm is zero the
        result is zero (same shape as the dot product) instead of NaN.
    """
    dot = np.dot(a, b.T)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    if scale == 0:
        # A zero vector has no direction; 0/0 would yield NaN and a
        # RuntimeWarning, so report "no similarity" explicitly.
        return dot * 0.0
    return dot / scale
def cos_sim(a, b):
    """Return the cosine similarity between vectors *a* and *b*.

    Args:
        a: vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: vector of the same length.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either norm is zero the
        result is zero instead of NaN.
    """
    dot = np.dot(a, b)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    if scale == 0:
        # Avoid 0/0 -> NaN for zero vectors.
        return dot * 0.0
    return dot / scale
def similar_words(text, n):
    """Query the model for the *n* entries most similar to the tagged *text*.

    *text* is first run through ``pymorphy_tagger``; the tagged string is
    then passed to ``model.most_similar`` with ``topn=n`` and that result is
    returned unchanged.
    """
    tagged = pymorphy_tagger(text)
    neighbours = model.most_similar(tagged, topn=n)
    return neighbours
def diff(text1, text2):
    """Return ``cosine`` of the model vectors looked up for *text1* and *text2*."""
    first_vec = model[text1]
    second_vec = model[text2]
    return cosine(first_vec, second_vec)
def vertices(text, n):
    """Return the first element of every entry produced by ``similar_words``.

    The entries returned by ``similar_words(text, n)`` are indexed with
    ``[0]`` and collected into a list, in order.
    """
    neighbours = similar_words(text, n)
    return [entry[0] for entry in neighbours]
def adjacency_mat(vertices_list):
    """Build the square matrix of pairwise cosine values between words.

    Args:
        vertices_list: sequence of words; each is vectorized with
            ``vectorize_word``.

    Returns:
        NumPy array of shape ``(n, n)`` where entry ``[i, j]`` is
        ``cosine`` of the vectors of word ``i`` and word ``j``.
    """
    # Look each word up once; the pairwise loop below would otherwise
    # re-vectorize every word 2*n times.
    vectors = [vectorize_word(word) for word in vertices_list]
    n = len(vectors)
    cells = [cosine(row_vec, col_vec) for row_vec in vectors for col_vec in vectors]
    return np.array(cells).reshape(n, n)
def make_graph(mat, vertices_list, th):
    """Build a labelled graph from an adjacency matrix, dropping weak edges.

    Args:
        mat: square NumPy adjacency matrix (as built by ``adjacency_mat``).
        vertices_list: node labels, paired with the graph's nodes in
            iteration order.
        th: threshold; every edge whose weight is ``<= th`` is removed.

    Returns:
        The relabelled networkx graph with only edges of weight ``> th``
        (edges whose weight does not compare ``<= th`` are kept).
    """
    # nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is
    # the supported constructor for ndarray input.
    G = nx.from_numpy_array(mat)
    mapping = dict(zip(G, vertices_list))
    H = nx.relabel_nodes(G, mapping)
    weights = nx.get_edge_attributes(H, 'weight')
    weak_edges = [edge for edge, weight in weights.items() if weight <= th]
    H.remove_edges_from(weak_edges)
    return H
def draw_graph(graph, node_size, alpha, show_weights=False):
    """Plot *graph* with a spring layout and show the figure.

    Args:
        graph: networkx graph to draw; nodes are labelled with themselves.
        node_size: forwarded to ``nx.draw`` as ``node_size``.
        alpha: forwarded to ``nx.draw`` as ``alpha``.
        show_weights: when true, edge ``'weight'`` attributes are drawn as
            edge labels.
    """
    layout = nx.spring_layout(graph)
    plt.figure()
    node_names = {name: name for name in graph.nodes()}
    nx.draw(
        graph, layout,
        edge_color='black', width=1, linewidths=1,
        node_size=node_size, node_color='pink', alpha=alpha,
        labels=node_names,
    )
    if show_weights:
        edge_weights = nx.get_edge_attributes(graph, 'weight')
        nx.draw_networkx_edge_labels(graph, pos=layout, edge_labels=edge_weights)
    plt.show()
def text2graph(text, th, raw=True):
    """Run the whole text-to-graph pipeline.

    Args:
        text: raw string when *raw* is true, otherwise an iterable of
            already prepared tokens.
        th: edge threshold handed to ``make_graph``.
        raw: when true, digits are removed with ``clean_numbers`` and the
            text is tagged with ``pymorphy_tagger`` and split on whitespace.

    Returns:
        The graph produced by ``make_graph`` for the token list.
    """
    if not raw:
        tokens = list(text)
    else:
        tokens = pymorphy_tagger(clean_numbers(text)).split()
    matrix = adjacency_mat(tokens)
    return make_graph(matrix, tokens, th)
def getList(dct):
    """Return the keys of *dct* as a new list, in iteration order."""
    # list() over the mapping yields its keys; this also avoids the local
    # variable that used to shadow the builtin ``list``.
    return list(dct)
def clean_numbers(text):
    """Return *text* with every run of the digits 0-9 removed."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', text)
def vectorize_word(word):
    """Return the model vector for *word*, or zeros when it is unknown.

    Args:
        word: key to look up in the module-level ``model``.

    Returns:
        ``model[word]``; if the lookup raises ``KeyError``, a zero vector
        with the model's dimensionality.
    """
    try:
        return model[word]
    except KeyError:
        # Size the fallback from the loaded model instead of assuming 300
        # dimensions, so other embedding sizes still give matching shapes.
        return np.zeros(model.vector_size)
def metric_filtration(text_mat, text_tagged):
    """Compute average clustering of the graph at a range of thresholds.

    For every threshold in ``np.arange(0.1, 0.9, 0.1)`` a graph is built
    with ``make_graph`` and its ``nx.average_clustering`` value is stored
    under the key ``'threshold_<value>'``.

    NOTE(review): the thresholds come from a float-step ``arange``, so some
    keys may carry floating-point noise in their formatted value.
    """
    thresholds = np.arange(0.1, 0.9, 0.1)
    return {
        'threshold_{}'.format(level): nx.average_clustering(
            make_graph(text_mat, text_tagged, level)
        )
        for level in thresholds
    }
def draw_filtration_metric(average_clustering):
    """Draw a bar chart of a mapping: one bar per entry, keys as x tick labels."""
    positions = range(len(average_clustering))
    names = list(average_clustering.keys())
    heights = list(average_clustering.values())
    plt.bar(positions, heights, align='center')
    plt.xticks(positions, names)
def generate_random(lst):
    """Return a list of ``len(lst)`` items, each drawn with ``random.choice(lst)``.

    Items are sampled independently, so repeats are possible; an empty
    input yields an empty list.
    """
    return [random.choice(lst) for _ in range(len(lst))]
def select_triangles(lst, n):
    """Return the items of *lst* whose length is exactly *n*, in order."""
    return [clique for clique in lst if len(clique) == n]
def jaccard_similarity(g, h):
    """Return the Jaccard similarity between the element sets of *g* and *h*.

    Args:
        g: iterable of hashable items (iterating a networkx graph yields
            its nodes).
        h: iterable of hashable items.

    Returns:
        ``|G & H| / |G | H|`` rounded to 3 decimals. Two empty inputs are
        treated as identical and give ``1.0``.
    """
    left, right = set(g), set(h)
    union = left | right
    if not union:
        # Both inputs empty: avoid 0/0 and report them as identical.
        return 1.0
    # Sizes come from the sets themselves so duplicate items in a list
    # input cannot inflate the union.
    return round(len(left & right) / len(union), 3)
def average_distance(df, topic, column, func, th=0.1):
    """Average a graph metric over all pairs of text graphs of one topic.

    Args:
        df: dataframe to analyze; must have a ``'topic'`` column.
        topic: value of ``'topic'`` selecting the rows to use.
        column: name of the column holding the texts.
        func: callable taking two graphs and returning a number.
        th: threshold passed to ``text2graph`` (defaults to the previously
            hard-coded 0.1).

    Returns:
        Mean of ``func(a, b)`` over every ordered pair of graphs -- a graph
        paired with itself is included -- rounded to 3 decimals.

    Raises:
        ZeroDivisionError: if no row matches *topic*.
    """
    texts = df[df['topic'] == topic][column].tolist()
    graphs = [text2graph(text, th) for text in texts]
    distances = [func(first, second) for first in graphs for second in graphs]
    return round(sum(distances) / len(distances), 3)
def preprocess_query(text, func, th):
    """Turn *text* into a graph and evaluate *func* on it.

    Args:
        text: text to convert with ``text2graph``.
        func: callable applied to the resulting graph.
        th: threshold forwarded to ``text2graph``.

    Returns:
        Whatever ``func`` returns for the graph.
    """
    graph = text2graph(text, th)
    return func(graph)
def find_closest(query, func, df, th):
    """Rank the texts in *df* by closeness of their metric to the query's.

    The query is reduced to one value with ``preprocess_query``; the
    absolute difference to each entry of ``df['dist']`` is written into
    ``df['result']`` (the passed dataframe is modified), and ``df['text']``
    is returned sorted by that column in ascending order.
    """
    target = preprocess_query(query, func, th)

    def gap(value):
        return abs(target - value)

    df['result'] = df['dist'].apply(gap)
    ranked = df.sort_values('result')
    return ranked['text']
# TODO:
# - split this module into several thematic scripts
# - next decomposition procedure: max clique of the max clique, and so on
# - try geodesic distance between vectors
# https://arxiv.org/pdf/1810.10136.pdf