Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix hotwords OOV log #1139

Merged
merged 1 commit into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sherpa-onnx/csrc/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
break;
default:
SHERPA_ONNX_LOGE(
"Cannot find ID for token %s at line: %s. (Hint: words on "
"the same line are separated by spaces)",
word.c_str(), line.c_str());
"Cannot find ID for token %s at line: %s. (Hint: Check the "
"tokens.txt see if %s in it)",
word.c_str(), line.c_str(), word.c_str());
has_oov = true;
break;
}
Expand Down
20 changes: 12 additions & 8 deletions sherpa-onnx/python/sherpa_onnx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import List, Optional, Union


def text2token(
texts: List[str],
tokens: str,
Expand Down Expand Up @@ -33,20 +34,20 @@ def text2token(
is True, or it is a list of list of tokens.
"""
try:
import sentencepiece as spm
import sentencepiece as spm
except ImportError:
print('Please run')
print(' pip install sentencepiece')
print('before you continue')
print("Please run")
print(" pip install sentencepiece")
print("before you continue")
raise

try:
from pypinyin import pinyin
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
except ImportError:
print('Please run')
print(' pip install pypinyin')
print('before you continue')
print("Please run")
print(" pip install pypinyin")
print("before you continue")
raise

assert Path(tokens).is_file(), f"File not exists, {tokens}"
Expand Down Expand Up @@ -119,7 +120,10 @@ def text2token(
if txt in tokens_table:
text_list.append(tokens_table[txt] if output_ids else txt)
else:
print(f"OOV token : {txt}, skipping text : {text}.")
print(
f"Can't find token {txt} in token table, check your "
f"tokens.txt see if {txt} in it. skipping text : {text}."
)
contain_oov = True
break
if contain_oov:
Expand Down
Loading