Skip to content

Commit

Permalink
Update ontology_manager.py
Browse files Browse the repository at this point in the history
  • Loading branch information
huu4ontocord authored Mar 6, 2022
1 parent 6f89b39 commit 21f7672
Showing 1 changed file with 7 additions and 21 deletions.
28 changes: 7 additions & 21 deletions ontology/ontology_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
os.path.pardir)))
import default_onto_tags
from stopwords import stopwords

from cjk import *
mt5_underscore = "▁"
trannum = str.maketrans("0123456789", "1111111111")

Expand Down Expand Up @@ -104,7 +104,7 @@ def __init__(self, target_lang="", data_dir=None, tmp_dir=None, compound_word_st
"""
if OntologyManager.mt5_tokenizer is None:
OntologyManager.mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small", use_fast=True)
self.is_cjk = -1 if target_lang == "" else 1 if target_lang in ("zh", "ja", "ko") else 0
self.is_cjk = -1 if target_lang == "" else 1 if lang_is_cjk(target_lang) else 0
self.tag_type = tag_type
self.target_lang_lexicon = {}
self.target_lang = target_lang
Expand Down Expand Up @@ -335,7 +335,7 @@ def canonical_word(self, word, connector=None, supress_cjk_tokenize=False, do_lo
if connector is None:
connector = self.connector
if self.is_cjk < 0:
is_cjk = self.cjk_detect(word)
is_cjk = cjk_detect(word)
else:
is_cjk = self.is_cjk
if not supress_cjk_tokenize and is_cjk:
Expand Down Expand Up @@ -559,7 +559,7 @@ def in_ontology(self, word, connector=None, supress_cjk_tokenize=False, check_pe
if connector is None:
connector = self.connector
if self.is_cjk < 0:
is_cjk = self.cjk_detect(word)
is_cjk = cjk_detect(word)
else:
is_cjk = self.is_cjk
word, wordArr = self.canonical_word(word, connector, supress_cjk_tokenize, do_lower=False, do_trannum=False)
Expand Down Expand Up @@ -641,8 +641,8 @@ def cjk_tokenize_text(self, text, connector=None):
if not words2:
words2.append(word)
continue
if not self.cjk_detect(word):
if not self.cjk_detect(words2[-1]):
if not cjk_detect(word):
if not cjk_detect(words2[-1]):
if words2[-1] in self.strip_chars_set:
words2[-1] += " " + word
else:
Expand Down Expand Up @@ -677,7 +677,7 @@ def detect(self, text, connector=None, supress_cjk_tokenize=False, check_person_
labels = []
if connector is None:
connector = self.connector
if not supress_cjk_tokenize and self.cjk_detect(text):
if not supress_cjk_tokenize and cjk_detect(text):
text = self.cjk_tokenize_text(text, connector)
sent = text.strip().split()
len_sent = len(sent)
Expand Down Expand Up @@ -778,20 +778,6 @@ def tokenize(self, text, connector=None, supress_cjk_tokenize=False, return_dict
else:
return text

def cjk_detect(self, texts):
    """Guess which CJK language *texts* is written in.

    Returns "zh", "ko" or "ja" when *texts* contains at least one
    character from the corresponding Unicode block, otherwise None.
    Blocks are tested in a fixed order, so han ideographs win over
    kana for mixed Japanese text — this matches the original behavior.
    """
    checks = (
        ("[\u4e00-\u9FFF]", "zh"),  # CJK Unified Ideographs (han)
        ("[\uac00-\ud7a3]", "ko"),  # Hangul syllables
        ("[\u3040-\u30ff]", "ja"),  # Hiragana and Katakana
    )
    for pattern, lang in checks:
        if re.search(pattern, texts):
            return lang
    return None


if __name__ == "__main__":
data_dir = tmp_dir = None
if "-s" in sys.argv:
Expand Down

0 comments on commit 21f7672

Please sign in to comment.