diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index 7014348c3..8dafe7bfe 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -150,6 +150,21 @@ std::vector Lexicon::ConvertTextToTokenIdsEnglish( ToLowerCase(&text); std::vector words = SplitUtf8(text); + + if (debug_) { + fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str()); + fprintf(stderr, "Input text in bytes:"); + for (uint8_t c : text) { + fprintf(stderr, " %02x", c); + } + fprintf(stderr, "\n"); + fprintf(stderr, "After splitting to words:"); + for (const auto &w : words) { + fprintf(stderr, " %s", w.c_str()); + } + fprintf(stderr, "\n"); + } + int32_t blank = token2id_.at(" "); std::vector ans;