forked from bengxy/word2vec_zh
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_pre.py
80 lines (67 loc) · 1.67 KB
/
data_pre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# coding:utf-8
# 分词
import jieba
import os
import re
def load_stopwords(path='stopword.lib', encoding='utf-8'):
    """Read the stop-word list and return it as a set of words.

    The file is expected to hold one stop word per line.  Surrounding
    whitespace (including the line terminator) is stripped from every
    entry and blank lines are ignored, so the returned words can be
    compared directly against tokens.

    Args:
        path: Location of the stop-word file.  Defaults to
            ``'stopword.lib'`` in the current working directory.
        encoding: Text encoding used to decode the file.  Defaults to
            UTF-8; pass another codec if the list is stored differently.

    Returns:
        A ``set`` of stop words.

    Raises:
        OSError: If the file cannot be opened.
    """
    print('Load stopword_library...')
    with open(path, encoding=encoding) as stopfile:
        # Without stripping, every entry would carry its trailing newline
        # ("word\n") and would never equal a token.
        stripped = (line.strip() for line in stopfile)
        return {word for word in stripped if word}
# Module-level stop-word set, loaded once when this module is executed;
# remove_stopwords() looks tokens up in this name.
stopwords_library = load_stopwords()
def remove_stopwords(cut_origin=None, stopwords=None):
    """Remove stop words from ``cut_origin`` in place.

    WARNING: the input list itself is modified (callers keep using the
    same list object); nothing is returned.

    Args:
        cut_origin: List of tokens to filter.  ``None`` (the default) is
            treated as "nothing to do".
        stopwords: Container of words to drop.  When omitted, the
            module-level ``stopwords_library`` is used.

    Returns:
        None.
    """
    if cut_origin is None:
        return
    if stopwords is None:
        stopwords = stopwords_library
    # Rebuild the contents via slice assignment instead of calling
    # list.remove() while iterating: removing during iteration skips the
    # element that follows each removed one, so runs of consecutive stop
    # words were only partially removed.
    cut_origin[:] = [token for token in cut_origin if token not in stopwords]
def preprocess(in_filename, out_filename, encoding='utf-8'):
    """Segment every non-blank line of a text file and append the result.

    Each input line is stripped, segmented with ``jieba.lcut`` in full
    mode (``cut_all=True``), filtered with ``remove_stopwords`` and, if
    any tokens remain, written to the output file as one line of
    space-separated tokens.

    Args:
        in_filename: Path of the text file to read.
        out_filename: Path of the file to append to.  It is opened in
            append mode, so repeated calls accumulate into one file.
        encoding: Text encoding used for both files.  Defaults to UTF-8.

    Raises:
        OSError: If either file cannot be opened.
    """
    # The context manager closes both files even if segmentation or the
    # write raises part-way through.
    with open(in_filename, 'r', encoding=encoding) as in_file, \
            open(out_filename, 'a', encoding=encoding) as out_file:
        for line in in_file:
            # Each input line is treated as a single sentence; it is not
            # split on punctuation before segmentation.
            sentence = line.strip()
            if not sentence:
                continue
            cut_res = jieba.lcut(sentence, cut_all=True)
            remove_stopwords(cut_res)  # filters cut_res in place
            if not cut_res:
                continue
            out_file.write(' '.join(cut_res) + '\n')
if __name__ == '__main__':
    # Script entry point: segment a sample of the raw news files and
    # append everything to a single cut_data/cut.txt corpus file.
    data_dir = '/home/navallo/Documents/DATA/formalCompetition4/News_info_train/'
    cut_data_dir = 'cut_data/'
    sample_every = 10  # only every 10th directory entry is preprocessed

    # preprocess() opens its output in append mode, which fails when the
    # directory does not exist, so create it up front.
    os.makedirs(cut_data_dir, exist_ok=True)
    output_path = os.path.join(cut_data_dir, 'cut.txt')

    # NOTE: os.listdir() returns entries in arbitrary order, so which
    # files end up in the sample is not guaranteed to be reproducible.
    files = os.listdir(data_dir)
    # Take the 10th, 20th, 30th, ... entry (indices 9, 19, 29, ...).
    for afile in files[sample_every - 1::sample_every]:
        print('Preprocessing...:', afile)
        preprocess(os.path.join(data_dir, afile), output_path)