From b6da21e5e996acee137658ad75bbf8cc490e1699 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 17 Oct 2023 17:55:39 +0800
Subject: [PATCH 1/4] support Chinese vits models

---
 CMakeLists.txt                             |   2 +
 cmake/kaldi-decoder.cmake                  |   2 +-
 cmake/utfcpp.cmake                         |  45 +++++++
 sherpa-onnx/csrc/CMakeLists.txt            |   1 +
 sherpa-onnx/csrc/lexicon.cc                | 145 ++++++++++++++-------
 sherpa-onnx/csrc/lexicon.h                 |  26 +++-
 sherpa-onnx/csrc/offline-tts-vits-impl.h   |   2 +-
 sherpa-onnx/csrc/offline-tts-vits-model.cc |   5 +
 sherpa-onnx/csrc/offline-tts-vits-model.h  |   1 +
 sherpa-onnx/csrc/text-utils.cc             |  56 ++++++++
 sherpa-onnx/csrc/text-utils.h              |   2 +
 sherpa-onnx/csrc/utfcpp-test.cc            |  21 +++
 12 files changed, 259 insertions(+), 49 deletions(-)
 create mode 100644 cmake/utfcpp.cmake
 create mode 100644 sherpa-onnx/csrc/utfcpp-test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70a014e66..ca6a6276f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,8 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
   include(asio)
 endif()

+include(utfcpp)
+
 add_subdirectory(sherpa-onnx)

 if(SHERPA_ONNX_ENABLE_C_API)
diff --git a/cmake/kaldi-decoder.cmake b/cmake/kaldi-decoder.cmake
index ac2482bd0..9f85d70a7 100644
--- a/cmake/kaldi-decoder.cmake
+++ b/cmake/kaldi-decoder.cmake
@@ -6,7 +6,7 @@ function(download_kaldi_decoder)
   set(kaldi_decoder_HASH "SHA256=98bf445a5b7961ccf3c3522317d900054eaadb6a9cdcf4531e7d9caece94a56d")

   set(KALDI_DECODER_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
-  set(KALDI_DECODER_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
+  set(KALDI_DECODER_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
   set(KALDIFST_BUILD_PYTHON OFF CACHE BOOL "" FORCE)

   # If you don't have access to the Internet,
diff --git a/cmake/utfcpp.cmake b/cmake/utfcpp.cmake
new file mode 100644
index 000000000..1dc724374
--- /dev/null
+++ b/cmake/utfcpp.cmake
@@ -0,0 +1,45 @@
+function(download_utfcpp)
+  include(FetchContent)
+
+  set(utfcpp_URL  "https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz")
+  set(utfcpp_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz")
+  set(utfcpp_HASH "SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd")
+
+  # If you don't have access to the Internet,
+  # please pre-download utfcpp
+  set(possible_file_locations
+    $ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz
+    ${PROJECT_SOURCE_DIR}/utfcpp-3.2.5.tar.gz
+    ${PROJECT_BINARY_DIR}/utfcpp-3.2.5.tar.gz
+    /tmp/utfcpp-3.2.5.tar.gz
+    /star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz
+  )
+
+  foreach(f IN LISTS possible_file_locations)
+    if(EXISTS ${f})
+      set(utfcpp_URL  "${f}")
+      file(TO_CMAKE_PATH "${utfcpp_URL}" utfcpp_URL)
+      message(STATUS "Found local downloaded utfcpp: ${utfcpp_URL}")
+      set(utfcpp_URL2)
+      break()
+    endif()
+  endforeach()
+
+  FetchContent_Declare(utfcpp
+    URL
+      ${utfcpp_URL}
+      ${utfcpp_URL2}
+    URL_HASH          ${utfcpp_HASH}
+  )
+
+  FetchContent_GetProperties(utfcpp)
+  if(NOT utfcpp_POPULATED)
+    message(STATUS "Downloading utfcpp from ${utfcpp_URL}")
+    FetchContent_Populate(utfcpp)
+  endif()
+  message(STATUS "utfcpp is downloaded to ${utfcpp_SOURCE_DIR}")
+  # add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL)
+  include_directories(${utfcpp_SOURCE_DIR})
+endfunction()
+
+download_utfcpp()
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index 82f53bca6..174a3a394 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -331,6 +331,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
     stack-test.cc
     transpose-test.cc
     unbind-test.cc
+    utfcpp-test.cc
   )

  function(sherpa_onnx_add_test source)
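utfcpp is a small header-only library; once include_directories(${utfcpp_SOURCE_DIR}) is set as above, sources can include it as "source/utf8.h". As a minimal standalone sketch of the code-point iteration it provides (illustrative only, not part of the patch; the sample string is arbitrary):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    #include "source/utf8.h"  // header-only utfcpp; path is relative to ${utfcpp_SOURCE_DIR}

    int main() {
      std::string s = "abc, 早上好";
      char *p = const_cast<char *>(s.c_str());
      char *end = p + s.size();
      while (p < end) {
        uint32_t cp = utf8::next(p, end);  // decode one UTF-8 code point and advance p
        std::printf("U+%04X\n", static_cast<unsigned>(cp));
      }
      return 0;
    }

The same utf8::next / utf8::append calls are what the new SplitUtf8 helper in text-utils.cc relies on further down.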
diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc
index a2e4af68c..3707f1489 100644
--- a/sherpa-onnx/csrc/lexicon.cc
+++ b/sherpa-onnx/csrc/lexicon.cc
@@ -76,9 +76,105 @@ static std::vector<int32_t> ConvertTokensToIds(
 }

 Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
-                 const std::string &punctuations) {
+                 const std::string &punctuations, const std::string &language) {
+  InitLanguage(language);
+  InitTokens(tokens);
+  InitLexicon(lexicon);
+  InitPunctuations(punctuations);
+}
+
+std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
+    const std::string &text) const {
+  switch (language_) {
+    case Language::kEnglish:
+      return ConvertTextToTokenIdsEnglish(text);
+    case Language::kChinese:
+      return ConvertTextToTokenIdsChinese(text);
+    default:
+      SHERPA_ONNX_LOGE("Unknown language: %d", static_cast<int32_t>(language_));
+      exit(-1);
+  }
+
+  return {};
+}
+
+std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
+    const std::string &text) const {
+  std::vector<std::string> words = SplitUtf8(text);
+
+  std::vector<int64_t> ans;
+
+  ans.push_back(token2id_.at("sil"));
+
+  for (const auto &w : words) {
+    if (!word2ids_.count(w)) {
+      SHERPA_ONNX_LOGE("OOV %s. Ignore it!", w.c_str());
+      continue;
+    }
+
+    const auto &token_ids = word2ids_.at(w);
+    ans.insert(ans.end(), token_ids.begin(), token_ids.end());
+  }
+  ans.push_back(token2id_.at("sil"));
+  ans.push_back(token2id_.at("eos"));
+  return ans;
+}
+
+std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
+    const std::string &_text) const {
+  std::string text(_text);
+  ToLowerCase(&text);
+
+  std::vector<std::string> words = SplitUtf8(text);
+
+  std::vector<int64_t> ans;
+  for (const auto &w : words) {
+    if (punctuations_.count(w)) {
+      ans.push_back(token2id_.at(w));
+      continue;
+    }
+
+    if (!word2ids_.count(w)) {
+      SHERPA_ONNX_LOGE("OOV %s. Ignore it!", w.c_str());
+      continue;
+    }
+
+    const auto &token_ids = word2ids_.at(w);
+    ans.insert(ans.end(), token_ids.begin(), token_ids.end());
+    if (blank_ != -1) {
+      ans.push_back(blank_);
+    }
+  }
+
+  if (blank_ != -1 && !ans.empty()) {
+    // remove the last blank
+    ans.resize(ans.size() - 1);
+  }
+
+  return ans;
+}
+
+void Lexicon::InitTokens(const std::string &tokens) {
   token2id_ = ReadTokens(tokens);
-  blank_ = token2id_.at(" ");
+  if (token2id_.count(" ")) {
+    blank_ = token2id_.at(" ");
+  }
+}
+
+void Lexicon::InitLanguage(const std::string &_lang) {
+  std::string lang(_lang);
+  ToLowerCase(&lang);
+  if (lang == "english") {
+    language_ = Language::kEnglish;
+  } else if (lang == "chinese") {
+    language_ = Language::kChinese;
+  } else {
+    SHERPA_ONNX_LOGE("Unknown language: %s", _lang.c_str());
+    exit(-1);
+  }
+}
+
+void Lexicon::InitLexicon(const std::string &lexicon) {
   std::ifstream is(lexicon);

   std::string word;
@@ -109,8 +205,9 @@ Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
     }
     word2ids_.insert({std::move(word), std::move(ids)});
   }
+}

-  // process punctuations
+void Lexicon::InitPunctuations(const std::string &punctuations) {
   std::vector<std::string> punctuation_list;
   SplitStringToVector(punctuations, " ", false, &punctuation_list);
   for (auto &s : punctuation_list) {
@@ -118,46 +215,4 @@ Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
   }
 }

-std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
-    const std::string &_text) const {
-  std::string text(_text);
-  ToLowerCase(&text);
-
-  std::vector<std::string> words;
-  SplitStringToVector(text, " ", false, &words);
-
-  std::vector<int64_t> ans;
-  for (auto w : words) {
-    std::vector<int64_t> prefix;
-    while (!w.empty() && punctuations_.count(std::string(1, w[0]))) {
-      // if w begins with a punctuation
-      prefix.push_back(token2id_.at(std::string(1, w[0])));
-      w = std::string(w.begin() + 1, w.end());
-    }
-
-    std::vector<int64_t> suffix;
-    while (!w.empty() && punctuations_.count(std::string(1, w.back()))) {
-      suffix.push_back(token2id_.at(std::string(1, w.back())));
-      w = std::string(w.begin(), w.end() - 1);
-    }
-
-    if (!word2ids_.count(w)) {
-      SHERPA_ONNX_LOGE("OOV %s. Ignore it!", w.c_str());
-      continue;
-    }
-
-    const auto &token_ids = word2ids_.at(w);
-    ans.insert(ans.end(), prefix.begin(), prefix.end());
-    ans.insert(ans.end(), token_ids.begin(), token_ids.end());
-    ans.insert(ans.end(), suffix.rbegin(), suffix.rend());
-    ans.push_back(blank_);
-  }
-
-  if (!ans.empty()) {
-    ans.resize(ans.size() - 1);
-  }
-
-  return ans;
-}
-
 }  // namespace sherpa_onnx
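In the Chinese branch above, each piece returned by SplitUtf8 is looked up as a whole word in word2ids_, and the result is wrapped with a leading and trailing "sil" plus a final "eos". The following toy function restates that lookup outside the class; it is only a sketch under the assumption of 64-bit token IDs and is not the library API:

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Toy stand-in for the lookup done in Lexicon::ConvertTextToTokenIdsChinese.
    std::vector<int64_t> ToTokenIds(
        const std::vector<std::string> &pieces,
        const std::unordered_map<std::string, std::vector<int64_t>> &word2ids,
        const std::unordered_map<std::string, int64_t> &token2id) {
      std::vector<int64_t> ans;
      ans.push_back(token2id.at("sil"));     // leading silence
      for (const auto &w : pieces) {
        auto it = word2ids.find(w);
        if (it == word2ids.end()) continue;  // OOV pieces are skipped
        ans.insert(ans.end(), it->second.begin(), it->second.end());
      }
      ans.push_back(token2id.at("sil"));     // trailing silence
      ans.push_back(token2id.at("eos"));     // end of sentence
      return ans;
    }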
diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h
index 73d6c8a8d..74e374ee1 100644
--- a/sherpa-onnx/csrc/lexicon.h
+++ b/sherpa-onnx/csrc/lexicon.h
@@ -13,18 +13,40 @@

 namespace sherpa_onnx {

+// TODO(fangjun): Refactor it to an abstract class
 class Lexicon {
  public:
   Lexicon(const std::string &lexicon, const std::string &tokens,
-          const std::string &punctuations);
+          const std::string &punctuations, const std::string &language);

   std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const;

+ private:
+  std::vector<int64_t> ConvertTextToTokenIdsEnglish(
+      const std::string &text) const;
+
+  std::vector<int64_t> ConvertTextToTokenIdsChinese(
+      const std::string &text) const;
+
+  void InitLanguage(const std::string &lang);
+  void InitTokens(const std::string &tokens);
+  void InitLexicon(const std::string &lexicon);
+  void InitPunctuations(const std::string &punctuations);
+
+ private:
+  enum class Language {
+    kEnglish,
+    kChinese,
+    kUnknown,
+  };
+
  private:
   std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
   std::unordered_set<std::string> punctuations_;
   std::unordered_map<std::string, int32_t> token2id_;
-  int32_t blank_;  // ID for the blank token
+  int32_t blank_ = -1;  // ID for the blank token
+  Language language_;
+  //
 };

 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h
index 59651ab21..7174e78cd 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h
@@ -21,7 +21,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
   explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config)
       : model_(std::make_unique<OfflineTtsVitsModel>(config.model)),
         lexicon_(config.model.vits.lexicon, config.model.vits.tokens,
-                 model_->Punctuations()) {}
+                 model_->Punctuations(), model_->Language()) {}

   GeneratedAudio Generate(const std::string &text,
                           int64_t sid = 0) const override {
diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc
index 2d6792941..06aab516f 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-model.cc
+++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc
@@ -84,6 +84,7 @@ class OfflineTtsVitsModel::Impl {
   bool AddBlank() const { return add_blank_; }

   std::string Punctuations() const { return punctuations_; }
+  std::string Language() const { return language_; }

  private:
  void Init(void *model_data, size_t model_data_length) {
@@ -108,6 +109,7 @@ class OfflineTtsVitsModel::Impl {
     SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank");
     SHERPA_ONNX_READ_META_DATA(n_speakers_, "n_speakers");
     SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation");
+    SHERPA_ONNX_READ_META_DATA_STR(language_, "language");
   }

  private:
@@ -128,6 +130,7 @@ class OfflineTtsVitsModel::Impl {
   int32_t add_blank_;
   int32_t n_speakers_;
   std::string punctuations_;
+  std::string language_;
 };

 OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config)
@@ -147,4 +150,6 @@ std::string OfflineTtsVitsModel::Punctuations() const {
   return impl_->Punctuations();
 }

+std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); }
+
 }  // namespace sherpa_onnx
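Language() and Punctuations() are read from the custom metadata map stored inside the VITS ONNX file (the export script is expected to have written keys such as "language" and "punctuation"). A rough sketch of what the SHERPA_ONNX_READ_META_DATA_STR lookup corresponds to in the ONNX Runtime C++ API; this is an assumption about the macro's behavior, with error handling omitted:

    #include <string>

    #include "onnxruntime_cxx_api.h"  // ONNX Runtime C++ API

    std::string LookupLanguage(Ort::Session &session) {
      Ort::AllocatorWithDefaultOptions allocator;
      Ort::ModelMetadata meta = session.GetModelMetadata();
      // "language" is the custom metadata key this patch expects, e.g. "English"
      // or "Chinese"; Lexicon::InitLanguage() lower-cases it before comparing.
      auto value = meta.LookupCustomMetadataMapAllocated("language", allocator);
      return value ? std::string(value.get()) : std::string();
    }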
diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.h b/sherpa-onnx/csrc/offline-tts-vits-model.h
index de3927f73..0c8208d53 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-model.h
+++ b/sherpa-onnx/csrc/offline-tts-vits-model.h
@@ -38,6 +38,7 @@ class OfflineTtsVitsModel {
   bool AddBlank() const;

   std::string Punctuations() const;
+  std::string Language() const;

  private:
   class Impl;
diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc
index f54acc839..052d29006 100644
--- a/sherpa-onnx/csrc/text-utils.cc
+++ b/sherpa-onnx/csrc/text-utils.cc
@@ -8,12 +8,15 @@
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include

+#include "source/utf8.h"
+
 // This file is copied/modified from
 // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
@@ -158,4 +161,57 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
 template bool SplitStringToFloats(const std::string &full, const char *delim,
                                   bool omit_empty_strings,
                                   std::vector *out);

+std::vector<std::string> SplitUtf8(const std::string &text) {
+  char *begin = const_cast<char *>(text.c_str());
+  char *end = begin + text.size();
+
+  std::vector<std::string> ans;
+  std::string buf;
+
+  while (begin < end) {
+    uint32_t code = utf8::next(begin, end);
+
+    // 1. is punctuation
+    if (std::ispunct(code)) {
+      if (!buf.empty()) {
+        ans.push_back(std::move(buf));
+      }
+
+      char s[5] = {0};
+      utf8::append(code, s);
+      ans.push_back(s);
+      continue;
+    }
+
+    // 2. is space
+    if (std::isspace(code)) {
+      if (!buf.empty()) {
+        ans.push_back(std::move(buf));
+      }
+      continue;
+    }
+
+    // 3. is alpha
+    if (std::isalpha(code)) {
+      buf.push_back(code);
+      continue;
+    }
+
+    if (!buf.empty()) {
+      ans.push_back(std::move(buf));
+    }
+
+    // for others
+
+    char s[5] = {0};
+    utf8::append(code, s);
+    ans.push_back(s);
+  }
+
+  if (!buf.empty()) {
+    ans.push_back(std::move(buf));
+  }
+
+  return ans;
+}
 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/text-utils.h b/sherpa-onnx/csrc/text-utils.h
index 55d3a281e..07251eef9 100644
--- a/sherpa-onnx/csrc/text-utils.h
+++ b/sherpa-onnx/csrc/text-utils.h
@@ -119,6 +119,8 @@ bool SplitStringToFloats(const std::string &full, const char *delim,
 template <typename T>
 bool ConvertStringToReal(const std::string &str, T *out);

+std::vector<std::string> SplitUtf8(const std::string &text);
+
 }  // namespace sherpa_onnx

 #endif  // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
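SplitUtf8 above is the tokenizer both lexicon branches rely on: runs of alphabetic ASCII characters are accumulated into a word, whitespace flushes the current word, and punctuation or any other code point (notably a CJK character) becomes its own piece. A rough expectation sketch, not a test from this patch; note that what std::ispunct/std::isalpha report for code points outside the unsigned-char range depends on the platform:

    #include <string>
    #include <vector>

    #include "sherpa-onnx/csrc/text-utils.h"

    int main() {
      std::vector<std::string> pieces =
          sherpa_onnx::SplitUtf8("how are you, 早上好");
      // intended segmentation: {"how", "are", "you", ",", "早", "上", "好"}
      return 0;
    }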
diff --git a/sherpa-onnx/csrc/utfcpp-test.cc b/sherpa-onnx/csrc/utfcpp-test.cc
new file mode 100644
index 000000000..dc9eecc2f
--- /dev/null
+++ b/sherpa-onnx/csrc/utfcpp-test.cc
@@ -0,0 +1,21 @@
+// sherpa-onnx/csrc/utfcpp-test.cc
+//
+// Copyright (c) 2023 Xiaomi Corporation
+
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "sherpa-onnx/csrc/text-utils.h"
+
+namespace sherpa_onnx {
+
+TEST(UTF8, Case1) {
+  std::string hello = "你好, 早上好!世界. hello!。Hallo";
+  std::vector<std::string> ss = SplitUtf8(hello);
+  for (const auto &s : ss) {
+    std::cout << s << "\n";
+  }
+}
+
+}  // namespace sherpa_onnx

From 4a018fb8f8ef82774dad3523ae41f8b76b318719 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 17 Oct 2023 18:02:25 +0800
Subject: [PATCH 2/4] Fix style issues

---
 sherpa-onnx/csrc/text-utils.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc
index 052d29006..c08e857d9 100644
--- a/sherpa-onnx/csrc/text-utils.cc
+++ b/sherpa-onnx/csrc/text-utils.cc
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include

 #include "source/utf8.h"

From 52028b60edb6765e8fec6b6f24719c027d2d212b Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 17 Oct 2023 18:09:22 +0800
Subject: [PATCH 3/4] Release v1.8.2

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca6a6276f..183278d1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)

-set(SHERPA_ONNX_VERSION "1.8.1")
+set(SHERPA_ONNX_VERSION "1.8.2")

 # Disable warning about
 #
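The last patch below extends the CI script to exercise three models (the English vits-ljs model, the multi-speaker vits-vctk model, and the Chinese vits-zh-aishell3 model that this series adds support for) and adds a real-time-factor measurement to offline-tts.py. The formula is RTF = elapsed synthesis time / duration of the generated audio; for example, 0.5 s of computation that yields 2.0 s of audio gives RTF = 0.5 / 2.0 = 0.25, and values below 1.0 mean synthesis runs faster than real time.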
From 6b7a2b7f7546a978a0ce002476c8579f0832cb06 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 17 Oct 2023 22:47:10 +0800
Subject: [PATCH 4/4] minor fixes

---
 .github/scripts/test-python.sh         | 44 ++++++++++++++++++++++++--
 .github/workflows/run-python-test.yaml |  2 +-
 cmake/kaldi-native-fbank.cmake         | 16 +++++-----
 python-api-examples/offline-tts.py     | 15 +++++++++
 4 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh
index e6f37764a..6567dd59e 100755
--- a/.github/scripts/test-python.sh
+++ b/.github/scripts/test-python.sh
@@ -9,6 +9,10 @@ log() {
 }

 log "Offline TTS test"
+# test waves are saved in ./tts
+mkdir ./tts
+
+log "vits-ljs test"
 wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
 wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
 wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt
@@ -18,14 +22,48 @@ python3 ./python-api-examples/offline-tts.py \
   --vits-model=./vits-ljs.onnx \
   --vits-lexicon=./lexicon.txt \
   --vits-tokens=./tokens.txt \
-  --output-filename=./tts.wav \
+  --output-filename=./tts/vits-ljs.wav \
   'liliana, the most beautiful and lovely assistant of our team!'

-ls -lh ./tts.wav
-file ./tts.wav
+ls -lh ./tts

 rm -v vits-ljs.onnx ./lexicon.txt ./tokens.txt

+log "vits-vctk test"
+wget -qq https://huggingface.co/csukuangfj/vits-vctk/resolve/main/vits-vctk.onnx
+wget -qq https://huggingface.co/csukuangfj/vits-vctk/resolve/main/lexicon.txt
+wget -qq https://huggingface.co/csukuangfj/vits-vctk/resolve/main/tokens.txt
+
+for sid in 0 10 90; do
+  python3 ./python-api-examples/offline-tts.py \
+    --vits-model=./vits-vctk.onnx \
+    --vits-lexicon=./lexicon.txt \
+    --vits-tokens=./tokens.txt \
+    --sid=$sid \
+    --output-filename=./tts/vits-vctk-${sid}.wav \
+    'liliana, the most beautiful and lovely assistant of our team!'
+done
+
+rm -v vits-vctk.onnx ./lexicon.txt ./tokens.txt
+
+log "vits-zh-aishell3"
+
+wget -qq https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/vits-aishell3.onnx
+wget -qq https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/lexicon.txt
+wget -qq https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/tokens.txt
+
+for sid in 0 10 90; do
+  python3 ./python-api-examples/offline-tts.py \
+    --vits-model=./vits-aishell3.onnx \
+    --vits-lexicon=./lexicon.txt \
+    --vits-tokens=./tokens.txt \
+    --sid=$sid \
+    --output-filename=./tts/vits-aishell3-${sid}.wav \
+    '林美丽最美丽'
+done
+
+rm -v vits-aishell3.onnx ./lexicon.txt ./tokens.txt
+
 mkdir -p /tmp/icefall-models

 dir=/tmp/icefall-models
diff --git a/.github/workflows/run-python-test.yaml b/.github/workflows/run-python-test.yaml
index 56f98a9ff..e395c0210 100644
--- a/.github/workflows/run-python-test.yaml
+++ b/.github/workflows/run-python-test.yaml
@@ -69,4 +69,4 @@ jobs:
       - uses: actions/upload-artifact@v3
         with:
           name: tts-generated-test-files
-          path: tts.wav
+          path: tts
diff --git a/cmake/kaldi-native-fbank.cmake b/cmake/kaldi-native-fbank.cmake
index d561ce882..38751b67c 100644
--- a/cmake/kaldi-native-fbank.cmake
+++ b/cmake/kaldi-native-fbank.cmake
@@ -1,9 +1,9 @@
 function(download_kaldi_native_fbank)
   include(FetchContent)

-  set(kaldi_native_fbank_URL  "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.18.1.tar.gz")
-  set(kaldi_native_fbank_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.18.1.tar.gz")
-  set(kaldi_native_fbank_HASH "SHA256=c7676f319fa97e8c8bca6018792de120895dcfe122fa9b4bff00f8f9165348e7")
+  set(kaldi_native_fbank_URL  "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.18.5.tar.gz")
+  set(kaldi_native_fbank_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.18.5.tar.gz")
+  set(kaldi_native_fbank_HASH "SHA256=dce0cb3bc6fece5d8053d8780cb4ce22da57cb57ebec332641661521a0425283")

   set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
   set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
@@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
   # If you don't have access to the Internet,
   # please pre-download kaldi-native-fbank
   set(possible_file_locations
-    $ENV{HOME}/Downloads/kaldi-native-fbank-1.18.1.tar.gz
-    ${PROJECT_SOURCE_DIR}/kaldi-native-fbank-1.18.1.tar.gz
-    ${PROJECT_BINARY_DIR}/kaldi-native-fbank-1.18.1.tar.gz
-    /tmp/kaldi-native-fbank-1.18.1.tar.gz
-    /star-fj/fangjun/download/github/kaldi-native-fbank-1.18.1.tar.gz
+    $ENV{HOME}/Downloads/kaldi-native-fbank-1.18.5.tar.gz
+    ${PROJECT_SOURCE_DIR}/kaldi-native-fbank-1.18.5.tar.gz
+    ${PROJECT_BINARY_DIR}/kaldi-native-fbank-1.18.5.tar.gz
+    /tmp/kaldi-native-fbank-1.18.5.tar.gz
+    /star-fj/fangjun/download/github/kaldi-native-fbank-1.18.5.tar.gz
   )

   foreach(f IN LISTS possible_file_locations)
diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py
index 85e588040..f36ea6f49 100755
--- a/python-api-examples/offline-tts.py
+++ b/python-api-examples/offline-tts.py
@@ -20,9 +20,14 @@
     --vits-tokens=./tokens.txt \
     --output-filename=./generated.wav \
     'liliana, the most beautiful and lovely assistant of our team!'
+
+Please see
+https://k2-fsa.github.io/sherpa/onnx/tts/index.html
+for details.
 """

 import argparse
+import time

 import sherpa_onnx
 import soundfile as sf
@@ -115,7 +120,14 @@ def main():
         )
     )
     tts = sherpa_onnx.OfflineTts(tts_config)
+
+    start = time.time()
     audio = tts.generate(args.text, sid=args.sid)
+    end = time.time()
+    elapsed_seconds = end - start
+    audio_duration = len(audio.samples) / audio.sample_rate
+    real_time_factor = elapsed_seconds / audio_duration
+
     sf.write(
         args.output_filename,
         audio.samples,
@@ -124,6 +136,9 @@ def main():
     )
     print(f"Saved to {args.output_filename}")
     print(f"The text is '{args.text}'")
+    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
+    print(f"Audio duration in seconds: {audio_duration:.3f}")
+    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


 if __name__ == "__main__":