This repository has been archived by the owner on Mar 19, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
68 lines (56 loc) · 2.63 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unicodedata
import html2text
# Build the translation table used to strip non-word symbols from text.
# See http://www.unicode.org/reports/tr44/#General_Category_Values for the
# meaning of the two-letter Unicode general categories listed below.
# Every character whose category is listed here is replaced by a space.
exclude_unicode_set = {
    "Cc",  # Control: a C0 or C1 control code
    "Cf",  # Format: a format control character
    "Cs",  # Surrogate: a surrogate code point
    "Co",  # Private_Use: a private-use character
    "Cn",  # Unassigned: a reserved unassigned code point or a noncharacter
    "Zl",  # Line_Separator: U+2028 LINE SEPARATOR only
    "Zp",  # Paragraph_Separator: U+2029 PARAGRAPH SEPARATOR only
    "Sm",  # Math_Symbol: a symbol of mathematical use
    "Sc",  # Currency_Symbol: a currency sign
    "Sk",  # Modifier_Symbol: a non-letterlike modifier symbol
    "So",  # Other_Symbol: a symbol of other type
    "Pc",  # Connector_Punctuation: a connecting punctuation mark, like a tie
    "Pd",  # Dash_Punctuation: a dash or hyphen punctuation mark
    "Ps",  # Open_Punctuation: an opening punctuation mark (of a pair)
    "Pe",  # Close_Punctuation: a closing punctuation mark (of a pair)
    "Pi",  # Initial_Punctuation: an initial quotation mark
    "Pf",  # Final_Punctuation: a final quotation mark
    "Po",  # Other_Punctuation: a punctuation mark of other type
}

# "@" and "." for emails; "/" and "?" for urls (only path) glue
include_symbols = '@./?'

# ``range`` excludes its stop value, so ``+ 1`` is required for the very last
# code point (``sys.maxunicode``) to be classified as well.
all_chars = {chr(code_point) for code_point in range(sys.maxunicode + 1)}
exclude_chars = {
    c for c in all_chars if unicodedata.category(c) in exclude_unicode_set
}

# Excluded characters become a space so the words around them stay separated.
translate_table = {ord(character): " " for character in exclude_chars}
# "@", "." and the other glue symbols map to ``None`` (i.e. they are deleted)
# so that the parts of an email or url are glued into one token.  This update
# runs last on purpose: it overrides the " " entries created just above.
translate_table.update({ord(character): None for character in include_symbols})
# Maps every hexadecimal digit to ``None`` so ``str.translate`` deletes it;
# whatever survives the translation is a non-hexadecimal character.
hex_trans_table = {ord(char): None for char in '0123456789abcdefABCDEF'}


def is_hex_number(s):
    """
    Tell whether ``s`` is made up of hexadecimal digits only.

    All characters from ``0-9``, ``a-f`` and ``A-F`` are deleted from the
    string; if nothing is left, the string is considered a hex number.
    Prefixes such as ``0x`` are not recognised, and ordinary words spelled
    only with the letters a-f (e.g. ``"bed"``) are reported as hex too.

    :param s: string to check
    :return: ``True`` if ``s`` is non-empty and contains only hex digits,
        ``False`` otherwise (including for the empty string)
    """
    # An empty string trivially translates to '' but is not a number at all.
    return bool(s) and s.translate(hex_trans_table) == ''
def text_to_words(body):
    """
    Break a text body into a list of cleaned, lower-cased words.

    The body is lower-cased, run through ``html2text.html2text`` to get rid
    of html, translated with ``translate_table`` and finally split on
    whitespace. Tokens that are plain digits or that ``is_hex_number``
    accepts are left out of the result.

    :param body: text to parse
    :return: list of the remaining words, in their order of appearance
    """
    # lower-case first, then remove html
    lowered = body.lower()
    plain = html2text.html2text(lowered)
    # clean unwanted symbols (control characters, punctuation, ...)
    cleaned = plain.translate(translate_table)
    # split into tokens and keep everything that is not a (hex) number
    return [
        token
        for token in cleaned.split()
        if not (token.isdigit() or is_hex_number(token))
    ]