diff --git a/CMakeLists.txt b/CMakeLists.txt index d8fb8e547..2ecd32c9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.8.6") +set(SHERPA_ONNX_VERSION "1.8.7") # Disable warning about # diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index 26b215b43..b74892531 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -133,6 +133,8 @@ std::vector Lexicon::ConvertTextToTokenIds( return ConvertTextToTokenIdsGerman(text); case Language::kSpanish: return ConvertTextToTokenIdsSpanish(text); + case Language::kFrench: + return ConvertTextToTokenIdsFrench(text); case Language::kChinese: return ConvertTextToTokenIdsChinese(text); default: @@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) { language_ = Language::kGerman; } else if (lang == "spanish") { language_ = Language::kSpanish; + } else if (lang == "french") { + language_ = Language::kFrench; } else if (lang == "chinese") { language_ = Language::kChinese; } else { diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h index ecbf26513..af993712a 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -46,6 +46,11 @@ class Lexicon { return ConvertTextToTokenIdsEnglish(text); } + std::vector ConvertTextToTokenIdsFrench( + const std::string &text) const { + return ConvertTextToTokenIdsEnglish(text); + } + std::vector ConvertTextToTokenIdsEnglish( const std::string &text) const; @@ -62,6 +67,7 @@ class Lexicon { kEnglish, kGerman, kSpanish, + kFrench, kChinese, kUnknown, }; diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc index e3196b6ab..c01c31b3f 100644 --- a/sherpa-onnx/csrc/text-utils.cc +++ b/sherpa-onnx/csrc/text-utils.cc @@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, std::vector *out); static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } -static bool IsGermanUmlauts(const std::string &word) { +static bool IsGermanUmlaut(const std::string &word) { // ä 0xC3 0xA4 // ö 0xC3 0xB6 // ü 0xC3 0xBC @@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) { } // see https://www.tandem.net/blog/spanish-accents +// https://www.compart.com/en/unicode/U+00DC static bool IsSpanishDiacritic(const std::string &word) { // á 0xC3 0xA1 // é 0xC3 0xA9 @@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) { // ú 0xC3 0xBA // ü 0xC3 0xBC // ñ 0xC3 0xB1 + // + // uppercase + // + // Á 0xC3 0x81 + // É 0xC3 0x89 + // Í 0xC3 0x8D + // Ó 0xC3 0x93 + // Ú 0xC3 0x9A + // Ü 0xC3 0x9C + // Ñ 0xC3 0x91 if (word.size() != 2 || static_cast(word[0]) != 0xc3) { return false; @@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) { auto c = static_cast(word[1]); if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || - c == 0xbc || c == 0xb1) { + c == 0xbc || c == 0xb1 || c == 0x81 || c == 0x89 || c == 0x8d || + c == 0x93 || c == 0x9a || c == 0x9c || c == 0x91) { return true; } return false; } +// see https://www.busuu.com/en/french/accent-marks +static bool IsFrenchDiacritic(const std::string &word) { + // acute accent + // é 0xC3 0xA9 + // + // grave accent + // à 0xC3 0xA0 + // è 0xC3 0xA8 + // ù 0xC3 0xB9 + // + // cedilla + // ç 0xC3 0xA7 + // + // circumflex + // â 0xC3 0xA2 + // ê 0xC3 0xAA + // î 0xC3 0xAE + // ô 0xC3 0xB4 + // û 0xC3 0xBB + // + // trema + // ë 0xC3 0xAB + // ï 0xC3 0xAF + // ü 0xC3 0xBC + // + // É 0xC3 0x89 + // + // À 0xC3 0x80 + // È 0xC3 0x88 + // Ù 0xC3 0x99 + // Ç 0xC3 0x87 + // Â 0xC3 0x82 + // Ê 0xC3 0x8A + // Î 0xC3 0x8E + // Ô 0xC3 0x94 + // Û 0xC3 0x9B + // Ë 0xC3 0x8B + // Ï 0xC3 0x8F + // Ü 0xC3 0x9C + + if (word.size() != 2 || static_cast(word[0]) != 0xc3) { + return false; + } + + auto c = static_cast(word[1]); + if (c == 0xa9 || c == 0xa0 || c == 0xa8 || c == 0xb9 || c == 0xa7 || + c == 0xa2 || c == 0xaa || c == 0xae || c == 0xb4 || c == 0xbb || + c == 0xab || c == 0xaf || c == 0xbc || c == 0x89 || c == 0x80 || + c == 0x88 || c == 0x99 || c == 0x87 || c == 0x82 || c == 0x8a || + c == 0x8e || c == 0x94 || c == 0x9b || c == 0x8b || c == 0x8f || + c == 0x9c) { + return true; + } + return false; +} + static bool IsSpecial(const std::string &w) { - return IsGermanUmlauts(w) || IsSpanishDiacritic(w); + bool ans = IsGermanUmlaut(w) || IsSpanishDiacritic(w) || IsFrenchDiacritic(w); + + // for french d’impossible + // ’ 0xE2 0x80 0x99 + bool ans2 = false; + if (w.size() == 3) { + auto c0 = static_cast(w[0]); + auto c1 = static_cast(w[1]); + auto c2 = static_cast(w[2]); + if (c0 == 0xe2 && c1 == 0x80 && c2 == 0x99) { + ans2 = true; + } + } + + return ans || ans2; } static std::vector MergeCharactersIntoWords(