cleanfiles.py
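
"""Extract plain text from the .txt files under a source directory.

Each file is run through html2text, empty and duplicate results are skipped,
and the remaining text is written into the destination directory.
"""
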
import argparse
import collections
import concurrent.futures
import hashlib
import os
import re

import html2text

# Dirty hack: a module-level global shared by every worker thread;
# this may not be thread-safe and comes with no guarantees.
seen_hashes = collections.defaultdict(list)


def md5(text: str) -> str:
    h = hashlib.md5()
    h.update(text.encode("utf8"))
    return h.hexdigest()


def preprocess_text(text):
    # Normalize Windows-style line endings.
    text = re.sub(r"\r\n", "\n", text)
    # Dirty hack: replace newlines with <br> so html2text preserves the line breaks.
    text = re.sub(r"\n", "<br>", text)
    return text
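

# process_file returns a one-line status string per file; the markers below
# summarize how each input was handled:
#   "src -X dst"         source file was empty, nothing to do
#   "src XX dst"         converted text was empty, nothing written
#   "src -Y dst: [...]"  text duplicates an already-seen file (list of originals)
#   "src -> dst"         text was written to dst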


def process_file(src, dst):
    with open(src, 'r') as src_file:
        content = src_file.read()
    # No need to process empty content.
    if len(content) == 0 or content.isspace():
        return f"{src} -X {dst}"
    content = preprocess_text(content)
    text = html2text.html2text(content, bodywidth=0)
    # Remove whitespace at the beginning and the end.
    text = text.strip()
    # Clean strange symbols (unencodable characters) with a lossy round-trip.
    text = text.encode('utf8', 'replace').decode('utf8', 'replace')
    # No need to save empty text.
    if len(text) == 0 or text.isspace():
        return f"{src} XX {dst}"
    # Skip text whose hash has already been seen.
    h = md5(text)
    if h in seen_hashes:
        return f"{src} -Y {dst}: {seen_hashes[h]}"
    seen_hashes[h].append(src)
    with open(dst, "w", encoding="utf8") as dst_file:
        dst_file.write(text)
    return f"{src} -> {dst}"


def get_files(src_dir, dst_dir):
    # Note: files with the same name in different subdirectories map to the same
    # destination path and will overwrite each other.
    for root, _, files in os.walk(src_dir):
        for name in files:
            if name.endswith(".txt"):
                src_file = os.path.join(root, name)
                dst_file = os.path.join(dst_dir, name)
                yield src_file, dst_file


def main():
    # Parse command-line arguments.
    parser = argparse.ArgumentParser(description="extract text from files")
    parser.add_argument("src", type=str, help="src dir")
    parser.add_argument("dst", type=str, help="dst dir")
    args = parser.parse_args()
    os.makedirs(args.dst, exist_ok=True)  # make sure the dst dir exists
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {}
        for src_file, dst_file in get_files(args.src, args.dst):
            futures[executor.submit(process_file, src_file, dst_file)] = (src_file, dst_file)
        for future in concurrent.futures.as_completed(futures):
            try:
                rst = future.result()
                print(rst)
            except UnicodeEncodeError as ex:
                print(f"got exception processing {futures[future][0]}")
                print(ex)
                raise


if __name__ == "__main__":
    main()
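

# A minimal usage sketch (the directory names below are illustrative, not taken
# from the repository):
#
#   python cleanfiles.py ./raw_texts ./clean_texts
#
# For every .txt file found under ./raw_texts this converts the content with
# html2text, skips empty and duplicate results, and writes the rest into
# ./clean_texts, printing one status line per file.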