diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index 18f7c257d6..1549b484df 100644 --- a/.github/workflows/c-api.yaml +++ b/.github/workflows/c-api.yaml @@ -101,6 +101,10 @@ jobs: ./matcha-tts-zh-c-api + rm ./matcha-tts-zh-c-api + rm -rf matcha-icefall-* + rm hifigan_v2.onnx + - name: Test Matcha TTS (en) shell: bash run: | @@ -121,6 +125,10 @@ jobs: ./matcha-tts-en-c-api + rm ./matcha-tts-en-c-api + rm -rf matcha-icefall-* + rm hifigan_v2.onnx + - uses: actions/upload-artifact@v4 with: name: matcha-tts-${{ matrix.os }} diff --git a/.github/workflows/cxx-api.yaml b/.github/workflows/cxx-api.yaml index 8779011a95..2fdc563134 100644 --- a/.github/workflows/cxx-api.yaml +++ b/.github/workflows/cxx-api.yaml @@ -83,6 +83,61 @@ jobs: otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib fi + - name: Test Matcha TTS (zh) + shell: bash + run: | + g++ -std=c++17 -o matcha-tts-zh-cxx-api ./cxx-api-examples/matcha-tts-zh-cxx-api.cc \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-cxx-api \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./matcha-tts-zh-cxx-api + + rm -rf matcha-icefall-* + rm hifigan_v2.onnx + rm matcha-tts-zh-cxx-api + + - name: Test Matcha TTS (en) + shell: bash + run: | + g++ -std=c++17 -o matcha-tts-en-cxx-api ./cxx-api-examples/matcha-tts-en-cxx-api.cc \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-cxx-api \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + curl -SL -O 
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./matcha-tts-en-cxx-api + + rm matcha-tts-en-cxx-api + rm -rf matcha-icefall-* + rm hifigan_v2.onnx + + - uses: actions/upload-artifact@v4 + with: + name: matcha-tts-${{ matrix.os }} + path: ./generated-matcha-*.wav + - name: Test Moonshine tiny shell: bash run: | diff --git a/c-api-examples/matcha-tts-en-c-api.c b/c-api-examples/matcha-tts-en-c-api.c index 103ecd5237..99b0a9742d 100644 --- a/c-api-examples/matcha-tts-en-c-api.c +++ b/c-api-examples/matcha-tts-en-c-api.c @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { "Friends fell out often because life was changing so fast. 
The easiest " "thing in the world was to lose touch with someone."; - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); int32_t sid = 0; float speed = 1.0; // larger -> faster in speech speed diff --git a/c-api-examples/matcha-tts-zh-c-api.c b/c-api-examples/matcha-tts-zh-c-api.c index c7667f0cb3..9fb9f4597d 100644 --- a/c-api-examples/matcha-tts-zh-c-api.c +++ b/c-api-examples/matcha-tts-zh-c-api.c @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); int32_t sid = 0; float speed = 1.0; // larger -> faster in speech speed diff --git a/c-api-examples/offline-tts-c-api.c b/c-api-examples/offline-tts-c-api.c index 7fbdb004ca..eaa25af392 100644 --- a/c-api-examples/offline-tts-c-api.c +++ b/c-api-examples/offline-tts-c-api.c @@ -229,7 +229,7 @@ int32_t main(int32_t argc, char *argv[]) { ShowUsage(); } - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); const SherpaOnnxGeneratedAudio *audio = SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0); diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index cc4082e87c..dd61d3294e 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -14,3 +14,11 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) + +if(SHERPA_ONNX_ENABLE_TTS) + add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc) + target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api) + + add_executable(matcha-tts-en-cxx-api ./matcha-tts-en-cxx-api.cc) + 
target_link_libraries(matcha-tts-en-cxx-api sherpa-onnx-cxx-api) +endif() diff --git a/cxx-api-examples/matcha-tts-en-cxx-api.cc b/cxx-api-examples/matcha-tts-en-cxx-api.cc new file mode 100644 index 0000000000..ef4187d060 --- /dev/null +++ b/cxx-api-examples/matcha-tts-en-cxx-api.cc @@ -0,0 +1,80 @@ +// cxx-api-examples/matcha-tts-en-cxx-api.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +// This file shows how to use sherpa-onnx CXX API +// for English TTS with MatchaTTS. +// +// clang-format off +/* +Usage + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 +rm matcha-icefall-en_US-ljspeech.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +./matcha-tts-en-cxx-api + + */ +// clang-format on + +#include + +#include "sherpa-onnx/c-api/cxx-api.h" + +static int32_t ProgressCallback(const float *samples, int32_t num_samples, + float progress, void *arg) { + fprintf(stderr, "Progress: %.3f%%\n", progress * 100); + // return 1 to continue generating + // return 0 to stop generating + return 1; +} + +int32_t main(int32_t argc, char *argv[]) { + using namespace sherpa_onnx::cxx; // NOLINT + OfflineTtsConfig config; + + config.model.matcha.acoustic_model = + "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx"; + + config.model.matcha.vocoder = "./hifigan_v2.onnx"; + + config.model.matcha.tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt"; + + config.model.matcha.data_dir = + "./matcha-icefall-en_US-ljspeech/espeak-ng-data"; + + config.model.num_threads = 1; + + // If you don't want to see debug messages, please set it to 0 + config.model.debug = 1; + + std::string filename = "./generated-matcha-en-cxx.wav"; + std::string text = + "Today as always, men fall into two groups: slaves and free men. 
Whoever " + "does not have two-thirds of his day for himself, is a slave, whatever " + "he may be: a statesman, a businessman, an official, or a scholar. " + "Friends fell out often because life was changing so fast. The easiest " + "thing in the world was to lose touch with someone."; + + auto tts = OfflineTts::Create(config); + int32_t sid = 0; + float speed = 1.0; // larger -> faster in speech speed + +#if 0 + // If you don't want to use a callback, then please enable this branch + GeneratedAudio audio = tts.Generate(text, sid, speed); +#else + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback); +#endif + + WriteWave(filename, {audio.samples, audio.sample_rate}); + + fprintf(stderr, "Input text is: %s\n", text.c_str()); + fprintf(stderr, "Speaker ID is is: %d\n", sid); + fprintf(stderr, "Saved to: %s\n", filename.c_str()); + + return 0; +} diff --git a/cxx-api-examples/matcha-tts-zh-cxx-api.cc b/cxx-api-examples/matcha-tts-zh-cxx-api.cc new file mode 100644 index 0000000000..f63065994f --- /dev/null +++ b/cxx-api-examples/matcha-tts-zh-cxx-api.cc @@ -0,0 +1,79 @@ +// cxx-api-examples/matcha-tts-zh-cxx-api.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +// This file shows how to use sherpa-onnx CXX API +// for Chinese TTS with MatchaTTS. 
+// +// clang-format off +/* +Usage + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 +tar xvf matcha-icefall-zh-baker.tar.bz2 +rm matcha-icefall-zh-baker.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +./matcha-tts-zh-cxx-api + + */ +// clang-format on + +#include + +#include "sherpa-onnx/c-api/cxx-api.h" + +static int32_t ProgressCallback(const float *samples, int32_t num_samples, + float progress, void *arg) { + fprintf(stderr, "Progress: %.3f%%\n", progress * 100); + // return 1 to continue generating + // return 0 to stop generating + return 1; +} + +int32_t main(int32_t argc, char *argv[]) { + using namespace sherpa_onnx::cxx; // NOLINT + OfflineTtsConfig config; + config.model.matcha.acoustic_model = + "./matcha-icefall-zh-baker/model-steps-3.onnx"; + config.model.matcha.vocoder = "./hifigan_v2.onnx"; + config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt"; + config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt"; + config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict"; + config.model.num_threads = 1; + + // If you don't want to see debug messages, please set it to 0 + config.model.debug = 1; + + // clang-format off + config.rule_fsts = "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"; // NOLINT + // clang-format on + + std::string filename = "./generated-matcha-zh-cxx.wav"; + std::string text = + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如" + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感" + "受着生命的奇迹与温柔." 
+ "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " + "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; + + auto tts = OfflineTts::Create(config); + int32_t sid = 0; + float speed = 1.0; // larger -> faster in speech speed + +#if 0 + // If you don't want to use a callback, then please enable this branch + GeneratedAudio audio = tts.Generate(text, sid, speed); +#else + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback); +#endif + + WriteWave(filename, {audio.samples, audio.sample_rate}); + + fprintf(stderr, "Input text is: %s\n", text.c_str()); + fprintf(stderr, "Speaker ID is is: %d\n", sid); + fprintf(stderr, "Saved to: %s\n", filename.c_str()); + + return 0; +} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 6afc1bf627..584f933217 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1114,7 +1114,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( return tts_config; } -SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( +const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( const SherpaOnnxOfflineTtsConfig *config) { auto tts_config = GetOfflineTtsConfig(config); @@ -1130,7 +1130,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( return tts; } -void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; } +void SherpaOnnxDestroyOfflineTts(const SherpaOnnxOfflineTts *tts) { + delete tts; +} int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) { return tts->impl->SampleRate(); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index e79d951e22..691b92a3bd 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -950,11 +950,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; // Create an instance of offline TTS. The user has to use DestroyOfflineTts() // to free the returned pointer to avoid memory leak. 
-SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( +SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( const SherpaOnnxOfflineTtsConfig *config); // Free the pointer returned by SherpaOnnxCreateOfflineTts() -SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts( + const SherpaOnnxOfflineTts *tts); // Return the sample rate of the current TTS object SHERPA_ONNX_API int32_t @@ -984,7 +985,6 @@ SHERPA_ONNX_API const SherpaOnnxGeneratedAudio * SherpaOnnxOfflineTtsGenerateWithProgressCallback( const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, - SherpaOnnxGeneratedAudioProgressCallback callback); SHERPA_ONNX_API diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index c66221f0ea..0b19e112e3 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -24,6 +24,11 @@ Wave ReadWave(const std::string &filename) { return ans; } +bool WriteWave(const std::string &filename, const Wave &wave) { + return SherpaOnnxWriteWave(wave.samples.data(), wave.samples.size(), + wave.sample_rate, filename.c_str()); +} + OnlineStream::OnlineStream(const SherpaOnnxOnlineStream *p) : MoveOnly(p) {} @@ -311,4 +316,73 @@ OfflineRecognizerResult OfflineRecognizer::GetResult( return ans; } +OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { + struct SherpaOnnxOfflineTtsConfig c; + memset(&c, 0, sizeof(c)); + + c.model.vits.model = config.model.vits.model.c_str(); + c.model.vits.lexicon = config.model.vits.lexicon.c_str(); + c.model.vits.tokens = config.model.vits.tokens.c_str(); + c.model.vits.data_dir = config.model.vits.data_dir.c_str(); + c.model.vits.noise_scale = config.model.vits.noise_scale; + c.model.vits.noise_scale_w = config.model.vits.noise_scale_w; + c.model.vits.length_scale = config.model.vits.length_scale; + c.model.vits.dict_dir = config.model.vits.dict_dir.c_str(); + + 
c.model.matcha.acoustic_model = config.model.matcha.acoustic_model.c_str(); + c.model.matcha.vocoder = config.model.matcha.vocoder.c_str(); + c.model.matcha.lexicon = config.model.matcha.lexicon.c_str(); + c.model.matcha.tokens = config.model.matcha.tokens.c_str(); + c.model.matcha.data_dir = config.model.matcha.data_dir.c_str(); + c.model.matcha.noise_scale = config.model.matcha.noise_scale; + c.model.matcha.length_scale = config.model.matcha.length_scale; + c.model.matcha.dict_dir = config.model.matcha.dict_dir.c_str(); + + c.model.num_threads = config.model.num_threads; + c.model.debug = config.model.debug; + c.model.provider = config.model.provider.c_str(); + + c.rule_fsts = config.rule_fsts.c_str(); + c.max_num_sentences = config.max_num_sentences; + c.rule_fars = config.rule_fars.c_str(); + + auto p = SherpaOnnxCreateOfflineTts(&c); + return OfflineTts(p); +} + +OfflineTts::OfflineTts(const SherpaOnnxOfflineTts *p) + : MoveOnly(p) {} + +void OfflineTts::Destroy(const SherpaOnnxOfflineTts *p) const { + SherpaOnnxDestroyOfflineTts(p); +} + +int32_t OfflineTts::SampleRate() const { + return SherpaOnnxOfflineTtsSampleRate(p_); +} + +int32_t OfflineTts::NumSpeakers() const { + return SherpaOnnxOfflineTtsNumSpeakers(p_); +} + +GeneratedAudio OfflineTts::Generate(const std::string &text, + int32_t sid /*= 0*/, float speed /*= 1.0*/, + OfflineTtsCallback callback /*= nullptr*/, + void *arg /*= nullptr*/) const { + const SherpaOnnxGeneratedAudio *audio; + if (!callback) { + audio = SherpaOnnxOfflineTtsGenerate(p_, text.c_str(), sid, speed); + } else { + audio = SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg( + p_, text.c_str(), sid, speed, callback, arg); + } + + GeneratedAudio ans; + ans.samples = std::vector{audio->samples, audio->samples + audio->n}; + ans.sample_rate = audio->sample_rate; + + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); + return ans; +} + } // namespace sherpa_onnx::cxx diff --git a/sherpa-onnx/c-api/cxx-api.h 
b/sherpa-onnx/c-api/cxx-api.h index 2a476efa18..12932f3f2e 100644 --- a/sherpa-onnx/c-api/cxx-api.h +++ b/sherpa-onnx/c-api/cxx-api.h @@ -97,6 +97,10 @@ struct Wave { SHERPA_ONNX_API Wave ReadWave(const std::string &filename); +// Return true on success; +// Return false on failure +SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave); + template class SHERPA_ONNX_API MoveOnly { public: @@ -307,6 +311,91 @@ class SHERPA_ONNX_API OfflineRecognizer explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p); }; +// ============================================================================ +// Non-streaming TTS +// ============================================================================ +struct OfflineTtsVitsModelConfig { + std::string model; + std::string lexicon; + std::string tokens; + std::string data_dir; + std::string dict_dir; + + float noise_scale = 0.667; + float noise_scale_w = 0.8; + float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed +}; + +struct OfflineTtsMatchaModelConfig { + std::string acoustic_model; + std::string vocoder; + std::string lexicon; + std::string tokens; + std::string data_dir; + std::string dict_dir; + + float noise_scale = 0.667; + float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed +}; + +struct OfflineTtsModelConfig { + OfflineTtsVitsModelConfig vits; + OfflineTtsMatchaModelConfig matcha; + int32_t num_threads = 1; + bool debug = false; + std::string provider = "cpu"; +}; + +struct OfflineTtsConfig { + OfflineTtsModelConfig model; + std::string rule_fsts; + std::string rule_fars; + int32_t max_num_sentences = 1; +}; + +struct GeneratedAudio { + std::vector samples; // in the range [-1, 1] + int32_t sample_rate; +}; + +// Return 1 to continue generating +// Return 0 to stop generating +using OfflineTtsCallback = int32_t (*)(const float *samples, + int32_t num_samples, float progress, + void *arg); + +class SHERPA_ONNX_API OfflineTts + : public MoveOnly { + 
public: + static OfflineTts Create(const OfflineTtsConfig &config); + + void Destroy(const SherpaOnnxOfflineTts *p) const; + + // Return the sample rate of the generated audio + int32_t SampleRate() const; + + // Number of supported speakers. + // If it supports only a single speaker, then it returns 0 or 1. + int32_t NumSpeakers() const; + + // @param text A string containing words separated by spaces + // @param sid Speaker ID. Used only for multi-speaker models, e.g., models + // trained using the VCTK dataset. It is not used for + // single-speaker models, e.g., models trained using the ljspeech + // dataset. + // @param speed The speed for the generated speech. E.g., 2 means 2x faster. + // @param callback If not NULL, it is called whenever config.max_num_sentences + // sentences have been processed. The callback is called in + // the current thread. + GeneratedAudio Generate(const std::string &text, int32_t sid = 0, + float speed = 1.0, + OfflineTtsCallback callback = nullptr, + void *arg = nullptr) const; + + private: + explicit OfflineTts(const SherpaOnnxOfflineTts *p); +}; + } // namespace sherpa_onnx::cxx #endif // SHERPA_ONNX_C_API_CXX_API_H_