From 2488baf27f828719a1cc727fe3d08091d73ca8af Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Fri, 20 Oct 2023 12:04:09 +0800
Subject: [PATCH] small fixes

---
 sherpa-onnx/csrc/lexicon.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc
index 7014348c3..8dafe7bfe 100644
--- a/sherpa-onnx/csrc/lexicon.cc
+++ b/sherpa-onnx/csrc/lexicon.cc
@@ -150,6 +150,21 @@ std::vector Lexicon::ConvertTextToTokenIdsEnglish(
   ToLowerCase(&text);
 
   std::vector words = SplitUtf8(text);
+
+  if (debug_) {
+    fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str());
+    fprintf(stderr, "Input text in bytes:");
+    for (uint8_t c : text) {
+      fprintf(stderr, " %02x", c);
+    }
+    fprintf(stderr, "\n");
+    fprintf(stderr, "After splitting to words:");
+    for (const auto &w : words) {
+      fprintf(stderr, " %s", w.c_str());
+    }
+    fprintf(stderr, "\n");
+  }
+
   int32_t blank = token2id_.at(" ");
 
   std::vector ans;