Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JavaScript API (node-addon-api) for MatchaTTS models. #1677

Merged
merged 1 commit into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/scripts/test-nodejs-addon-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,25 @@ fi

echo "----------tts----------"

# MatchaTTS tests.
#
# Both example scripts load the same vocoder file (./hifigan_v2.onnx), so it
# is downloaded once here and shared by the English and Chinese runs instead
# of being deleted and fetched a second time from the same URL.
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

# English acoustic model.
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

node ./test_tts_non_streaming_matcha_icefall_en.js
# Remove the model directory right away to keep disk usage low.
rm -rf matcha-icefall-en_US-ljspeech

# Chinese acoustic model.
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

node ./test_tts_non_streaming_matcha_icefall_zh.js
rm -rf matcha-icefall-zh-baker

# The shared vocoder is no longer needed once both tests have run.
rm hifigan_v2.onnx

# Show the wave files written by the two tests.
ls -lh *.wav

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
tar xf vits-piper-en_GB-cori-medium.tar.bz2
rm vits-piper-en_GB-cori-medium.tar.bz2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,28 @@ static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig(
return c;
}

// Builds a SherpaOnnxOfflineTtsMatchaModelConfig from the "matcha" property
// of the given JavaScript object.
//
// @param obj  JavaScript object that may carry a "matcha" sub-object.
// @return     A config struct that is zero-filled first. If `obj` has no
//             "matcha" property, or that property is not an object, the
//             all-zero struct is returned as is.
//
// NOTE(review): CreateOfflineTtsWrapper releases the string fields
// (acoustic_model, vocoder, lexicon, tokens, data_dir, dict_dir) with
// delete[], so SHERPA_ONNX_ASSIGN_ATTR_STR presumably allocates them with
// new[] -- confirm against the macro definition.
static SherpaOnnxOfflineTtsMatchaModelConfig GetOfflineTtsMatchaModelConfig(
Napi::Object obj) {
SherpaOnnxOfflineTtsMatchaModelConfig c;
// Zero every field so that anything not assigned below stays null / 0.
memset(&c, 0, sizeof(c));

if (!obj.Has("matcha") || !obj.Get("matcha").IsObject()) {
return c;
}

Napi::Object o = obj.Get("matcha").As<Napi::Object>();
// NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are defined outside this
// view. They appear to copy the JS property named by the second argument
// (read from the local `o`) into the struct field named by the first argument
// (on the local `c`), which would make the local names `c` and `o` part of
// the macro contract -- verify before renaming either.
SHERPA_ONNX_ASSIGN_ATTR_STR(acoustic_model, acousticModel);
SHERPA_ONNX_ASSIGN_ATTR_STR(vocoder, vocoder);
SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
SHERPA_ONNX_ASSIGN_ATTR_STR(data_dir, dataDir);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(noise_scale, noiseScale);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);

return c;
}

static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
Napi::Object obj) {
SherpaOnnxOfflineTtsModelConfig c;
Expand All @@ -43,6 +65,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
Napi::Object o = obj.Get("model").As<Napi::Object>();

c.vits = GetOfflineTtsVitsModelConfig(o);
c.matcha = GetOfflineTtsMatchaModelConfig(o);

SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

Expand Down Expand Up @@ -107,9 +130,10 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
decltype(&OH_ResourceManager_ReleaseNativeResourceManager)>
mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]),
&OH_ResourceManager_ReleaseNativeResourceManager);
SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
const SherpaOnnxOfflineTts *tts =
SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
#else
SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
#endif

if (c.model.vits.model) {
Expand All @@ -132,6 +156,30 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
delete[] c.model.vits.dict_dir;
}

if (c.model.matcha.acoustic_model) {
delete[] c.model.matcha.acoustic_model;
}

if (c.model.matcha.vocoder) {
delete[] c.model.matcha.vocoder;
}

if (c.model.matcha.lexicon) {
delete[] c.model.matcha.lexicon;
}

if (c.model.matcha.tokens) {
delete[] c.model.matcha.tokens;
}

if (c.model.matcha.data_dir) {
delete[] c.model.matcha.data_dir;
}

if (c.model.matcha.dict_dir) {
delete[] c.model.matcha.dict_dir;
}

if (c.model.provider) {
delete[] c.model.provider;
}
Expand All @@ -152,7 +200,8 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
}

return Napi::External<SherpaOnnxOfflineTts>::New(
env, tts, [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
env, const_cast<SherpaOnnxOfflineTts *>(tts),
[](Napi::Env env, SherpaOnnxOfflineTts *tts) {
SherpaOnnxDestroyOfflineTts(tts);
});
}
Expand Down
24 changes: 24 additions & 0 deletions nodejs-addon-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ The following tables list the examples in this folder.

|File| Description|
|---|---|
|[./test_tts_non_streaming_matcha_icefall_en.js](./test_tts_non_streaming_matcha_icefall_en.js)| Text-to-speech with a [MatchaTTS English Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)|
|[./test_tts_non_streaming_matcha_icefall_zh.js](./test_tts_non_streaming_matcha_icefall_zh.js)| Text-to-speech with a [MatchaTTS Chinese Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)|
|[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model|
|[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model|
|[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)|
Expand Down Expand Up @@ -345,6 +347,28 @@ npm install naudiodon2
node ./test_vad_asr_non_streaming_sense_voice_microphone.js
```

### Text-to-speech with MatchaTTS models (English TTS)
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

node ./test_tts_non_streaming_matcha_icefall_en.js
```

### Text-to-speech with MatchaTTS models (Chinese TTS)
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

node ./test_tts_non_streaming_matcha_icefall_zh.js
```

### Text-to-speech with piper VITS models (TTS)

```bash
Expand Down
48 changes: 48 additions & 0 deletions nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_en.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright (c) 2025 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// please refer to
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
// to download model files
/**
 * Creates the offline (non-streaming) TTS engine used by this example.
 *
 * The configuration points at the matcha-icefall-en_US-ljspeech model files
 * and the ./hifigan_v2.onnx vocoder, all relative to the current working
 * directory.
 *
 * @returns {sherpa_onnx.OfflineTts} A TTS instance built from that config.
 */
function createOfflineTts() {
  // Files that make up the MatchaTTS model.
  const matcha = {
    acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
    vocoder: './hifigan_v2.onnx',
    lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
    tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
    dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',
  };

  // Model-level settings shared by every model type.
  const model = {
    matcha,
    debug: true,
    numThreads: 1,
    provider: 'cpu',
  };

  const ttsConfig = {model, maxNumSentences: 1};
  return new sherpa_onnx.OfflineTts(ttsConfig);
}

// Generate speech for a fixed English sentence, report timing statistics and
// save the result as a wave file.
const tts = createOfflineTts();

const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

// Time only the generate() call so the reported numbers exclude model loading.
const start = Date.now();
const audio = tts.generate({text: text, sid: 0, speed: 1.0});
const stop = Date.now();

// Date.now() is in milliseconds; convert to seconds.
const elapsed_seconds = (stop - start) / 1000;
// Length of the generated audio in seconds.
const duration = audio.samples.length / audio.sampleRate;
// Real-time factor: processing time divided by audio duration.
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

const filename = 'test-matcha-en.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
50 changes: 50 additions & 0 deletions nodejs-addon-examples/test_tts_non_streaming_matcha_icefall_zh.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (c) 2025 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// please refer to
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
// to download model files
/**
 * Creates the offline (non-streaming) TTS engine used by this example.
 *
 * The configuration points at the matcha-icefall-zh-baker model files, the
 * ./hifigan_v2.onnx vocoder and three rule FST files (phone, date, number),
 * all relative to the current working directory.
 *
 * @returns {sherpa_onnx.OfflineTts} A TTS instance built from that config.
 */
function createOfflineTts() {
  // Files that make up the MatchaTTS model.
  const matcha = {
    acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
    vocoder: './hifigan_v2.onnx',
    lexicon: './matcha-icefall-zh-baker/lexicon.txt',
    tokens: './matcha-icefall-zh-baker/tokens.txt',
    dictDir: './matcha-icefall-zh-baker/dict',
  };

  // Model-level settings shared by every model type.
  const model = {
    matcha,
    debug: true,
    numThreads: 1,
    provider: 'cpu',
  };

  // Comma-separated list of rule FST files passed to the engine.
  const ruleFsts =
      './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';

  const ttsConfig = {model, maxNumSentences: 1, ruleFsts};
  return new sherpa_onnx.OfflineTts(ttsConfig);
}

// Generate speech for a fixed Chinese passage (it includes a date, phone
// numbers and an amount of money), report timing statistics and save the
// result as a wave file.
const tts = createOfflineTts();

const text =
    '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。';

// Time only the generate() call so the reported numbers exclude model loading.
const start = Date.now();
const audio = tts.generate({text: text, sid: 0, speed: 1.0});
const stop = Date.now();

// Date.now() is in milliseconds; convert to seconds.
const elapsed_seconds = (stop - start) / 1000;
// Length of the generated audio in seconds.
const duration = audio.samples.length / audio.sampleRate;
// Real-time factor: processing time divided by audio duration.
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

const filename = 'test-matcha-zh.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
2 changes: 1 addition & 1 deletion scripts/node-addon-api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"version": "1.0.0",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"dependencies": {
"cmake-js": "^7.0.0",
"cmake-js": "^7.3.0",
"node-addon-api": "^8.3.0",
"perf_hooks": "*"
},
Expand Down
Loading