-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2021-2-24-biology-clustering.py
161 lines (147 loc) · 5.31 KB
/
2021-2-24-biology-clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
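# Keyword co-occurrence
# -*- coding: utf-8 -*-
"""
@File : 2021-2-24-biology-clustering.py
@Author : Yuka
@Time : 2021/1/7 14:53
@Version : 1.0.0
@Description: Read keywords from an Excel sheet, encode them with a BERT
    service, cluster the embeddings with K-Means, project them to 2-D with
    t-SNE, and copy the resulting ECharts scatter-plot JSON to the clipboard.
"""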
import numpy as np
import pyperclip
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from textrank4zh import TextRank4Sentence
import json
import xlsxwriter
import mysql.connector.pooling
import pymongo
from bert_serving.client import BertClient
import numpy as np
import matplotlib.pyplot as plt
import xlrd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import font_manager
def sort_dict_by_value(d, reverse=True):
    """Return a new dict holding the items of *d* ordered by their values.

    Args:
        d: Mapping whose values are mutually comparable.
        reverse: When true (the default) the largest value comes first;
            when false the smallest value comes first.

    Returns:
        A new ``dict`` whose insertion order follows the sorted values.
        Items with equal values keep the relative order they had in *d*.
    """
    def value_of(pair):
        # Each pair is (key, value); ordering is decided by the value only.
        return pair[1]

    ranked_pairs = sorted(d.items(), key=value_of, reverse=reverse)
    return {key: value for key, value in ranked_pairs}
# file_3 = 'D:/monetwaredata/biology_co.xlsx'
# workbook_3 = xlsxwriter.Workbook(file_3)
# worksheet_3 = workbook_3.add_worksheet('Sheet1')
# word_art_file = "D:/monetwaredata/wordart.xlsx"
# workbook = xlsxwriter.Workbook(word_art_file)
# worksheet = workbook.add_worksheet('Sheet1')
# client = pymongo.MongoClient("mongodb://192.168.1.142:27017/admin?connectTimeoutMS=10000&authSource=admin")
# db = client["biology"]
# collection = db["biology_data"]
# word_cloud_dictionary = {}
# for x in collection.find():
# if "keyword" in x.keys():
# keywords = x["keyword"]
# for word in keywords:
# if word in word_cloud_dictionary:
# word_cloud_dictionary[word] += 1
# else:
# word_cloud_dictionary[word] = 1
#
# final_result = sort_dict_by_value(word_cloud_dictionary)
# print(len(final_result.keys()))
# keys_list = list(final_result.keys())
# for i in range(len(keys_list)):
# print(i)
# worksheet.write(i, 0, keys_list[i])
# worksheet.write(i, 1, final_result[keys_list[i]])
#
# workbook.close()
# print(final_result)
# labels = []
# for i in range(0, 500):
# labels.append(keys_list[i])
#
# bc = BertClient("wx.ringdata.net", port=15555, port_out=15556)
#
# for label in labels:
# if label != '':
# vector = bc.encode([label])
# print(vector)
# --- Load keywords, embed them, and project the embeddings to 2-D ----------
# Everything below runs at import time and talks to the local filesystem and
# a remote BERT service.
wb = xlrd.open_workbook('D:/monetwaredata/biology_co_en.xlsx')
sh = wb.sheet_by_name('Sheet1')

# Font handle for a CJK-capable font file; the only reference to it in the
# visible part of this file is inside commented-out matplotlib code.
my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simsun.ttc")

# Keywords come from column 0, rows 1..200 (row 0 is skipped; presumably a
# header row - confirm against the workbook). The upper bound is clamped to
# the sheet's real row count so a shorter sheet does not raise IndexError.
last_row_exclusive = min(201, sh.nrows)
first_column_values = (
    sh.cell(row, 0).value for row in range(1, last_row_exclusive)
)
# Cells holding only a non-breaking space are treated as empty and dropped.
final_result = [value for value in first_column_values if value != "\xa0"]

# Encode every keyword in a single request to the BERT service.
bc = BertClient("wx.ringdata.net", port=15555, port_out=15556)
vector = bc.encode(final_result)
print(vector)
print(len(vector))

# 2-D t-SNE projection of the embeddings, used later as plot coordinates.
sparse_vector = TSNE(n_components=2, learning_rate=200).fit_transform(vector)

# Colour palette for the plot; one entry per cluster index (24 colours).
colors_all = [
    '#37A2DA', '#e06343', '#37a354', '#b55dba', '#b5bd48', '#8378EA', '#96BFFF',
    '#6F4242', '#FF00FF', '#97694F', '#6B8E23', '#BC1717', '#00FFFF', '#7093DB', '#EAEAAE', '#238E68',
    '#70DB93', '#855E42', '#9370DB', '#6B4226', '#545454', '#426F42', '#8E6B23', '#856363'
]
# vector = []
# for i in range(1, 201):
# word_vector = []
# for j in range(1, 201):
# word_vector.append(sh.cell(i, j).value)
# vector.append(word_vector)
# sparse_vector = TSNE(n_components=2, learning_rate=200).fit_transform(vector)
# for label in labels:
# if label != '':
# vector = bc.encode([label])
# print(vector)
# vectors = TSNE(n_components=3, learning_rate=200).fit_transform(final_result)
# print(vectors)
# x = []
# y = []
# z = []
# for vector in vectors:
# x.append(vector[0])
# y.append(vector[1])
# z.append(vector[2])
# ax1 = plt.axes(projection='3d')
# ax1.plot3D(x, y, z, '*')
# text = ['疫情中的众生相', '防控部署', '医疗物资保障与基础设施建设', '疫情中的经济', '疫情中的文化传播', '疫情中的民生', '新冠肺炎医治', '疫情中的国际社会', '新冠疫情动态', '疫情中的法制']
# for i in range(len(x)):
# ax1.text(x[i], y[i], z[i], text[i], fontsize=12, fontproperties=my_font)
# plt.show()
def k_means_cluster(text_vectors, cluster_number):
    """Cluster the given vectors with K-Means and return their labels.

    Args:
        text_vectors: Array-like of feature vectors, one row per item, in a
            form accepted by ``KMeans.fit``.
        cluster_number: Number of clusters to fit.

    Returns:
        A plain ``list`` with one cluster index per row of *text_vectors*,
        in input order.
    """
    # ``fit`` returns the fitted estimator, so it can be bound directly.
    fitted_model = KMeans(n_clusters=cluster_number).fit(text_vectors)
    assignments = fitted_model.predict(text_vectors)
    return assignments.tolist()
# --- Cluster the keywords and export ECharts scatter data -------------------
def _group_by_label(items, labels, group_count):
    """Return *group_count* lists; list ``k`` holds the items labelled ``k``."""
    groups = [[] for _ in range(group_count)]
    for item, label in zip(items, labels):
        groups[label].append(item)
    return groups


cluster_number = 20

# Cluster on the full embeddings; ``result[k]`` is the cluster index of
# ``final_result[k]``.
result = k_means_cluster(vector, cluster_number)
cluster_result = _group_by_label(final_result, result, cluster_number)
for cluster in cluster_result:
    print(cluster)

# Same grouping, but clustered on the 2-D t-SNE coordinates. Nothing in the
# visible part of this file reads these two names afterwards.
result_sparse = k_means_cluster(sparse_vector, cluster_number)
sparse_cluster_result = _group_by_label(
    final_result, result_sparse, cluster_number
)

# Convert the projection to plain lists once instead of once per keyword.
points = sparse_vector.tolist()
palette_size = len(colors_all)

# One ECharts data item per keyword, shaped like:
#   {name: "...", value: [x, y], label: {show: ..., formatter: "..."},
#    itemStyle: {color: "#RRGGBB"}}
# The colour is taken from the keyword's own label in ``result`` rather than
# by searching the grouped lists for matching text, which avoids the nested
# scan and is unambiguous when the same keyword text appears more than once.
# The palette index wraps so a larger ``cluster_number`` cannot overrun it.
# NOTE(review): "show" is emitted as the string "true", not a JSON boolean -
# confirm that the chart consumer accepts that.
echarts_result = [
    {
        "name": keyword,
        "value": point,
        "label": {"show": "true", "formatter": keyword},
        "itemStyle": {"color": colors_all[label % palette_size]},
    }
    for keyword, point, label in zip(final_result, points, result)
]

# Put the JSON on the clipboard for pasting into a chart configuration.
pyperclip.copy(json.dumps(echarts_result, ensure_ascii=False))
print("画图结果为")  # i.e. "The plotting result is"
print(echarts_result)