diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh
index 755cde74b..d3e85f687 100755
--- a/.github/scripts/test-nodejs-addon-npm.sh
+++ b/.github/scripts/test-nodejs-addon-npm.sh
@@ -85,6 +85,25 @@ fi
 
 echo "----------tts----------"
 
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_en.js
+rm hifigan_v2.onnx
+rm -rf matcha-icefall-en_US-ljspeech
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_zh.js
+rm hifigan_v2.onnx
+rm -rf matcha-icefall-zh-baker
+ls -lh *.wav
+
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
 tar xf vits-piper-en_GB-cori-medium.tar.bz2
 rm vits-piper-en_GB-cori-medium.tar.bz2
diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc
index 05e27846d..7baf3ce8b 100644
--- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc
+++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc
@@ -31,6 +31,28 @@ static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig(
   return c;
 }
 
+static SherpaOnnxOfflineTtsMatchaModelConfig GetOfflineTtsMatchaModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineTtsMatchaModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("matcha") || !obj.Get("matcha").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("matcha").As<Napi::Object>();
+  SHERPA_ONNX_ASSIGN_ATTR_STR(acoustic_model, acousticModel);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(vocoder, vocoder);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(data_dir, dataDir);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(noise_scale, noiseScale);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
+
+  return c;
+}
+
 static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
     Napi::Object obj) {
   SherpaOnnxOfflineTtsModelConfig c;
@@ -43,6 +65,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
   Napi::Object o = obj.Get("model").As<Napi::Object>();
 
   c.vits = GetOfflineTtsVitsModelConfig(o);
+  c.matcha = GetOfflineTtsMatchaModelConfig(o);
 
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
 
@@ -107,9 +130,10 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
                   decltype(&OH_ResourceManager_ReleaseNativeResourceManager)>
       mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]),
           &OH_ResourceManager_ReleaseNativeResourceManager);
-  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
+  const SherpaOnnxOfflineTts *tts =
+      SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
 #else
-  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
+  const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
 #endif
 
   if (c.model.vits.model) {
@@ -132,6 +156,30 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
     delete[] c.model.vits.dict_dir;
   }
 
+  if (c.model.matcha.acoustic_model) {
+    delete[] c.model.matcha.acoustic_model;
+  }
+
+  if (c.model.matcha.vocoder) {
+    delete[] c.model.matcha.vocoder;
+  }
+
+  if (c.model.matcha.lexicon) {
+    delete[] c.model.matcha.lexicon;
+  }
+
+  if (c.model.matcha.tokens) {
+    delete[] c.model.matcha.tokens;
+  }
+
+  if (c.model.matcha.data_dir) {
+    delete[] c.model.matcha.data_dir;
+  }
+
+  if (c.model.matcha.dict_dir) {
+    delete[] c.model.matcha.dict_dir;
+  }
+
   if (c.model.provider) {
     delete[] c.model.provider;
   }
@@ -152,7 +200,8 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
   }
 
   return Napi::External<SherpaOnnxOfflineTts>::New(
-      env, tts, [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
+      env, const_cast<SherpaOnnxOfflineTts *>(tts),
+      [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
         SherpaOnnxDestroyOfflineTts(tts);
       });
 }
diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
index f436bcae2..ec2f23da2 100644
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -133,6 +133,8 @@ The following tables list the examples in this folder.
 
 |File| Description|
 |---|---|
+|[./test_tts_non_streaming_matcha_icefall_en.js](./test_tts_non_streaming_matcha_icefall_en.js)| Text-to-speech with a [MatchaTTS English Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)|
+|[./test_tts_non_streaming_matcha_icefall_zh.js](./test_tts_non_streaming_matcha_icefall_zh.js)| Text-to-speech with a [MatchaTTS Chinese Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)|
 |[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model|
 |[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model|
 |[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)|
@@ -345,6 +347,28 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_sense_voice_microphone.js
 ```
 
+### Text-to-speech with MatchaTTS models (English TTS)
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_en.js
+```
+
+### Text-to-speech with MatchaTTS models (Chinese TTS)
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_zh.js
+```
+
 ### Text-to-speech with piper VITS models (TTS)
 
 ```bash
diff --git a/nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_en.js b/nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_en.js
new file mode 100644
index 000000000..8c45d1dd3
--- /dev/null
+++ b/nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_en.js
@@ -0,0 +1,48 @@
+// Copyright (c) 2025 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// please refer to
+// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+// to download model files
+function createOfflineTts() {
+  const config = {
+    model: {
+      matcha: {
+        acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
+        vocoder: './hifigan_v2.onnx',
+        lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
+        tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
+        dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',
+      },
+      debug: true,
+      numThreads: 1,
+      provider: 'cpu',
+    },
+    maxNumSentences: 1,
+  };
+  return new sherpa_onnx.OfflineTts(config);
+}
+
+const tts = createOfflineTts();
+
+const text =
+    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+
+
+let start = Date.now();
+const audio = tts.generate({text: text, sid: 0, speed: 1.0});
+let stop = Date.now();
+const elapsed_seconds = (stop - start) / 1000;
+const duration = audio.samples.length / audio.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+const filename = 'test-matcha-en.wav';
+sherpa_onnx.writeWave(
+    filename, {samples: audio.samples, sampleRate: audio.sampleRate});
+
+console.log(`Saved to ${filename}`);
diff --git a/nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_zh.js b/nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_zh.js
new file mode 100644
index 000000000..1f667e3d2
--- /dev/null
+++ b/nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_zh.js
@@ -0,0 +1,50 @@
+// Copyright (c) 2025 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// please refer to
+// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+// to download model files
+function createOfflineTts() {
+  const config = {
+    model: {
+      matcha: {
+        acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
+        vocoder: './hifigan_v2.onnx',
+        lexicon: './matcha-icefall-zh-baker/lexicon.txt',
+        tokens: './matcha-icefall-zh-baker/tokens.txt',
+        dictDir: './matcha-icefall-zh-baker/dict',
+      },
+      debug: true,
+      numThreads: 1,
+      provider: 'cpu',
+    },
+    maxNumSentences: 1,
+    ruleFsts:
+        './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst',
+  };
+  return new sherpa_onnx.OfflineTts(config);
+}
+
+const tts = createOfflineTts();
+
+const text =
+    '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
+
+
+let start = Date.now();
+const audio = tts.generate({text: text, sid: 0, speed: 1.0});
+let stop = Date.now();
+const elapsed_seconds = (stop - start) / 1000;
+const duration = audio.samples.length / audio.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+const filename = 'test-matcha-zh.wav';
+sherpa_onnx.writeWave(
+    filename, {samples: audio.samples, sampleRate: audio.sampleRate});
+
+console.log(`Saved to ${filename}`);
diff --git a/scripts/node-addon-api/package.json b/scripts/node-addon-api/package.json
index f0bb57d0d..201cb77de 100644
--- a/scripts/node-addon-api/package.json
+++ b/scripts/node-addon-api/package.json
@@ -3,7 +3,7 @@
   "version": "1.0.0",
   "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "dependencies": {
-    "cmake-js": "^7.0.0",
+    "cmake-js": "^7.3.0",
     "node-addon-api": "^8.3.0",
     "perf_hooks": "*"
   },