Skip to content

Commit

Permalink
small fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
csukuangfj committed Oct 20, 2023
1 parent be770b5 commit 2488baf
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions sherpa-onnx/csrc/lexicon.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -150,6 +150,21 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
ToLowerCase(&text);

std::vector<std::string> words = SplitUtf8(text);

if (debug_) {
fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str());
fprintf(stderr, "Input text in bytes:");
for (uint8_t c : text) {
fprintf(stderr, " %02x", c);
}
fprintf(stderr, "\n");
fprintf(stderr, "After splitting to words:");
for (const auto &w : words) {
fprintf(stderr, " %s", w.c_str());
}
fprintf(stderr, "\n");
}

int32_t blank = token2id_.at(" ");

std::vector<int64_t> ans;
Expand Down

0 comments on commit 2488baf

Please sign in to comment.