Skip to content

Commit

Permalink
Update ontology_manager.py
Browse files Browse the repository at this point in the history
  • Loading branch information
huu4ontocord authored Mar 6, 2022
1 parent 6f89b39 commit 21f7672
Showing 1 changed file with 7 additions and 21 deletions.
28 changes: 7 additions & 21 deletions ontology/ontology_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
os.path.pardir)))
import default_onto_tags
from stopwords import stopwords

from cjk import *
mt5_underscore = "▁"
trannum = str.maketrans("0123456789", "1111111111")

Expand Down Expand Up @@ -104,7 +104,7 @@ def __init__(self, target_lang="", data_dir=None, tmp_dir=None, compound_word_st
"""
if OntologyManager.mt5_tokenizer is None:
OntologyManager.mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small", use_fast=True)
self.is_cjk = -1 if target_lang == "" else 1 if target_lang in ("zh", "ja", "ko") else 0
self.is_cjk = -1 if target_lang == "" else 1 if lang_is_cjk(target_lang) else 0
self.tag_type = tag_type
self.target_lang_lexicon = {}
self.target_lang = target_lang
Expand Down Expand Up @@ -335,7 +335,7 @@ def canonical_word(self, word, connector=None, supress_cjk_tokenize=False, do_lo
if connector is None:
connector = self.connector
if self.is_cjk < 0:
is_cjk = self.cjk_detect(word)
is_cjk = cjk_detect(word)
else:
is_cjk = self.is_cjk
if not supress_cjk_tokenize and is_cjk:
Expand Down Expand Up @@ -559,7 +559,7 @@ def in_ontology(self, word, connector=None, supress_cjk_tokenize=False, check_pe
if connector is None:
connector = self.connector
if self.is_cjk < 0:
is_cjk = self.cjk_detect(word)
is_cjk = cjk_detect(word)
else:
is_cjk = self.is_cjk
word, wordArr = self.canonical_word(word, connector, supress_cjk_tokenize, do_lower=False, do_trannum=False)
Expand Down Expand Up @@ -641,8 +641,8 @@ def cjk_tokenize_text(self, text, connector=None):
if not words2:
words2.append(word)
continue
if not self.cjk_detect(word):
if not self.cjk_detect(words2[-1]):
if not cjk_detect(word):
if not cjk_detect(words2[-1]):
if words2[-1] in self.strip_chars_set:
words2[-1] += " " + word
else:
Expand Down Expand Up @@ -677,7 +677,7 @@ def detect(self, text, connector=None, supress_cjk_tokenize=False, check_person_
labels = []
if connector is None:
connector = self.connector
if not supress_cjk_tokenize and self.cjk_detect(text):
if not supress_cjk_tokenize and cjk_detect(text):
text = self.cjk_tokenize_text(text, connector)
sent = text.strip().split()
len_sent = len(sent)
Expand Down Expand Up @@ -778,20 +778,6 @@ def tokenize(self, text, connector=None, supress_cjk_tokenize=False, return_dict
else:
return text

def cjk_detect(self, texts):
    """Guess which CJK language *texts* is written in.

    Returns "zh", "ko" or "ja" when *texts* contains at least one
    character from the corresponding Unicode block, otherwise None.
    Blocks are tested in a fixed order, so han ideographs win over
    kana for mixed Japanese text — this matches the original behavior.
    """
    checks = (
        ("[\u4e00-\u9FFF]", "zh"),  # CJK Unified Ideographs (han)
        ("[\uac00-\ud7a3]", "ko"),  # Hangul syllables
        ("[\u3040-\u30ff]", "ja"),  # Hiragana and Katakana
    )
    for pattern, lang in checks:
        if re.search(pattern, texts):
            return lang
    return None


if __name__ == "__main__":
data_dir = tmp_dir = None
if "-s" in sys.argv:
Expand Down

0 comments on commit 21f7672

Please sign in to comment.