From 77fdfe233cadaa3c00ebe8a0ad7f008357c739c4 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 26 Jul 2024 12:42:08 +0800 Subject: [PATCH] Add VAD + Non-streaming ASR example for JavaScript API. (#1170) --- .github/scripts/test-nodejs-addon-npm.sh | 13 ++ .gitignore | 1 + CHANGELOG.md | 5 + CMakeLists.txt | 2 +- .../keyword-spotter/pubspec.yaml | 2 +- .../non-streaming-asr/pubspec.yaml | 2 +- dart-api-examples/streaming-asr/pubspec.yaml | 2 +- dart-api-examples/tts/pubspec.yaml | 2 +- dart-api-examples/vad/pubspec.yaml | 2 +- flutter-examples/streaming_asr/pubspec.yaml | 4 +- flutter-examples/tts/pubspec.yaml | 4 +- flutter/sherpa_onnx/pubspec.yaml | 12 +- .../ios/sherpa_onnx_ios.podspec | 2 +- .../macos/sherpa_onnx_macos.podspec | 2 +- nodejs-addon-examples/README.md | 16 ++- nodejs-addon-examples/package.json | 2 +- ...test_vad_with_non_streaming_asr_whisper.js | 127 ++++++++++++++++++ scripts/dart/kws-pubspec.yaml | 2 +- scripts/dart/sherpa-onnx-pubspec.yaml | 2 +- scripts/node-addon-api/lib/vad.js | 6 +- sherpa-onnx/csrc/offline-stream.cc | 5 +- sherpa-onnx/csrc/online-recognizer.cc | 6 +- 22 files changed, 189 insertions(+), 32 deletions(-) create mode 100644 nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index 867c0f022..a46e2de8e 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -10,6 +10,19 @@ arch=$(node -p "require('os').arch()") platform=$(node -p "require('os').platform()") node_version=$(node -p "process.versions.node.split('.')[0]") +echo "----------non-streaming asr + vad----------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 +rm sherpa-onnx-whisper-tiny.en.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +curl 
-SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +node ./test_vad_with_non_streaming_asr_whisper.js +rm -rf sherpa-onnx-whisper* +rm *.wav +rm *.onnx + echo "----------asr----------" if [[ $arch != "ia32" && $platform != "win32" ]]; then diff --git a/.gitignore b/.gitignore index 5486ad51a..6260eff6d 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,4 @@ sherpa-onnx-telespeech-ctc-* .ccache lib*.a sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 +*.bak diff --git a/CHANGELOG.md b/CHANGELOG.md index 337ac7deb..cbae2dbee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 1.10.18 + +* Fix the case when recognition results contain the symbol `"`. It caused + issues when converting results to a json string. + ## 1.10.17 * Support SenseVoice CTC models. diff --git a/CMakeLists.txt b/CMakeLists.txt index d1022d2a4..d8b576a24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ project(sherpa-onnx) # ./nodejs-addon-examples # ./dart-api-examples/ # ./CHANGELOG.md -set(SHERPA_ONNX_VERSION "1.10.17") +set(SHERPA_ONNX_VERSION "1.10.18") # Disable warning about # diff --git a/dart-api-examples/keyword-spotter/pubspec.yaml b/dart-api-examples/keyword-spotter/pubspec.yaml index b95dcf728..919be610b 100644 --- a/dart-api-examples/keyword-spotter/pubspec.yaml +++ b/dart-api-examples/keyword-spotter/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 # sherpa_onnx: # path: ../../flutter/sherpa_onnx path: ^1.9.0 diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index 5821f09ff..4b49a1f54 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index edf64e3bf..6e8491519 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index 68e4bae47..36fca8937 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index 91f4c27dc..cc929ab4d 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 path: ^1.9.0 args: ^2.5.0 diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml index df6463b06..0e1afda8b 100644 --- a/flutter-examples/streaming_asr/pubspec.yaml +++ b/flutter-examples/streaming_asr/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' -version: 1.10.17 +version: 1.10.18 topics: - speech-recognition @@ -30,7 +30,7 @@ dependencies: record: ^5.1.0 url_launcher: ^6.2.6 - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 # sherpa_onnx: # path: ../../flutter/sherpa_onnx diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml index 72469714d..024877e7e 100644 --- a/flutter-examples/tts/pubspec.yaml +++ b/flutter-examples/tts/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' # Remove this line if you wish to publish to pub.dev -version: 1.10.17 
+version: 1.10.18 environment: sdk: '>=3.4.0 <4.0.0' @@ -17,7 +17,7 @@ dependencies: cupertino_icons: ^1.0.6 path_provider: ^2.1.3 path: ^1.9.0 - sherpa_onnx: ^1.10.17 + sherpa_onnx: ^1.10.18 url_launcher: ^6.2.6 audioplayers: ^5.0.0 diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index fa3a1f790..73c236bbb 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec -version: 1.10.17 +version: 1.10.18 homepage: https://github.com/k2-fsa/sherpa-onnx @@ -30,23 +30,23 @@ dependencies: flutter: sdk: flutter - sherpa_onnx_android: ^1.10.17 + sherpa_onnx_android: ^1.10.18 # sherpa_onnx_android: # path: ../sherpa_onnx_android - sherpa_onnx_macos: ^1.10.17 + sherpa_onnx_macos: ^1.10.18 # sherpa_onnx_macos: # path: ../sherpa_onnx_macos - sherpa_onnx_linux: ^1.10.17 + sherpa_onnx_linux: ^1.10.18 # sherpa_onnx_linux: # path: ../sherpa_onnx_linux # - sherpa_onnx_windows: ^1.10.17 + sherpa_onnx_windows: ^1.10.18 # sherpa_onnx_windows: # path: ../sherpa_onnx_windows - sherpa_onnx_ios: ^1.10.17 + sherpa_onnx_ios: ^1.10.18 # sherpa_onnx_ios: # path: ../sherpa_onnx_ios diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec index 54bf687ba..912b272af 100644 --- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec +++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec @@ -7,7 +7,7 @@ # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c Pod::Spec.new do |s| s.name = 'sherpa_onnx_ios' - s.version = '1.10.17' + s.version = '1.10.18' s.summary = 'A new Flutter FFI plugin project.' s.description = <<-DESC A new Flutter FFI plugin project. 
diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec index 2e645caa1..73de6cce0 100644 --- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec +++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec @@ -4,7 +4,7 @@ # Pod::Spec.new do |s| s.name = 'sherpa_onnx_macos' - s.version = '1.10.17' + s.version = '1.10.18' s.summary = 'sherpa-onnx Flutter FFI plugin project.' s.description = <<-DESC sherpa-onnx Flutter FFI plugin project. diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index 04db30825..b979c5126 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -93,6 +93,7 @@ The following tables list the examples in this folder. |---|---| |[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model| |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)| +|[./test_vad_with_non_streaming_asr_whisper.js](./test_vad_with_non_streaming_asr_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper) + [Silero VAD](https://github.com/snakers4/silero-vad)| |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| @@ -221,11 +222,24 @@ rm 
sherpa-onnx-whisper-tiny.en.tar.bz2 node ./test_asr_non_streaming_whisper.js -# To run VAD + non-streaming ASR with Paraformer using a microphone +# To run VAD + non-streaming ASR with Whisper using a microphone npm install naudiodon2 node ./test_vad_asr_non_streaming_whisper_microphone.js ``` +### Non-streaming speech recognition with Whisper + VAD + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 +rm sherpa-onnx-whisper-tiny.en.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +node ./test_vad_with_non_streaming_asr_whisper.js +``` + ### Non-streaming speech recognition with NeMo CTC models ```bash diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index e4bb08801..ad15bdbe8 100644 --- a/nodejs-addon-examples/package.json +++ b/nodejs-addon-examples/package.json @@ -1,5 +1,5 @@ { "dependencies": { - "sherpa-onnx-node": "^1.10.17" + "sherpa-onnx-node": "^1.10.18" } } diff --git a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js new file mode 100644 index 000000000..20e17db78 --- /dev/null +++ b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js @@ -0,0 +1,127 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+
+// Splits a wave file into speech segments with silero VAD and decodes each
+// segment with a non-streaming Whisper model.
+const sherpa_onnx = require('sherpa-onnx-node');
+
+/**
+ * Creates a non-streaming recognizer that uses the Whisper tiny.en model.
+ *
+ * @returns {sherpa_onnx.OfflineRecognizer} The configured recognizer.
+ */
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'whisper': {
+        'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
+        'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+/**
+ * Creates a silero VAD that is constructed with a 60-second buffer size.
+ *
+ * @returns {sherpa_onnx.Vad} The configured voice activity detector.
+ */
+function createVad() {
+  // Please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+/**
+ * Pops every queued segment from `vad`, decodes it with `recognizer`, and
+ * prints non-empty results as `start -- end: text` (times in seconds).
+ */
+function decodeSegments(vad, recognizer, sampleRate) {
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    // Convert the segment boundaries from sample counts to seconds.
+    const begin = segment.start / sampleRate;
+    const end = begin + segment.samples.length / sampleRate;
+
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({samples: segment.samples, sampleRate: sampleRate});
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${begin.toFixed(2)} -- ${end.toFixed(2)}: ${text}`);
+    }
+  }
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+// Please download ./Obama.wav from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const waveFilename = './Obama.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+
+if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
+  // Backticks are required so that ${...} is interpolated into the message.
+  throw new Error(
+      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
+}
+
+console.log('Started')
+let start = Date.now();
+
+// Feed the wave to the VAD one window at a time and decode segments as soon
+// as the VAD has queued them.
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+  decodeSegments(vad, recognizer, wave.sampleRate);
+}
+
+// Flush the VAD after the last window, then decode any segments that the
+// flush has queued.
+vad.flush();
+decodeSegments(vad, recognizer, wave.sampleRate);
+
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
diff --git
a/scripts/dart/kws-pubspec.yaml b/scripts/dart/kws-pubspec.yaml index 6a9c2652e..2471c82fe 100644 --- a/scripts/dart/kws-pubspec.yaml +++ b/scripts/dart/kws-pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - # sherpa_onnx: ^1.10.17 + # sherpa_onnx: ^1.10.18 sherpa_onnx: path: ../../flutter/sherpa_onnx path: ^1.9.0 diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml index f4563eada..596a3c638 100644 --- a/scripts/dart/sherpa-onnx-pubspec.yaml +++ b/scripts/dart/sherpa-onnx-pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec -version: 1.10.17 +version: 1.10.18 homepage: https://github.com/k2-fsa/sherpa-onnx diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js index 3c8681976..a7f7daa24 100644 --- a/scripts/node-addon-api/lib/vad.js +++ b/scripts/node-addon-api/lib/vad.js @@ -65,7 +65,7 @@ config = { } clear() { - addon.VoiceActivityDetectorClearWrapper(this.handle); + addon.voiceActivityDetectorClear(this.handle); } /* @@ -79,11 +79,11 @@ config = { } reset() { - addon.VoiceActivityDetectorResetWrapper(this.handle); + addon.voiceActivityDetectorReset(this.handle); } flush() { - addon.VoiceActivityDetectorFlushWrapper(this.handle); + addon.voiceActivityDetectorFlush(this.handle); } } diff --git a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index c7c1dc0c2..31f4a5748 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -306,8 +306,7 @@ std::string OfflineRecognitionResult::AsJsonString() const { os << "{"; os << "\"text\"" << ": "; - os << "\"" << text << "\"" - << ", "; + os << std::quoted(text) << ", "; os << "\"" << "timestamps" @@ -339,7 +338,7 @@ std::string OfflineRecognitionResult::AsJsonString() const { << "\""; os.flags(oldFlags); } else { - os << sep << "\"" << t << "\""; + os << sep << std::quoted(t); } sep = 
", "; } diff --git a/sherpa-onnx/csrc/online-recognizer.cc b/sherpa-onnx/csrc/online-recognizer.cc index 599a0553d..c6b9399d8 100644 --- a/sherpa-onnx/csrc/online-recognizer.cc +++ b/sherpa-onnx/csrc/online-recognizer.cc @@ -44,7 +44,7 @@ std::string VecToString(const std::vector &vec, oss << "["; std::string sep = ""; for (const auto &item : vec) { - oss << sep << "\"" << item << "\""; + oss << sep << std::quoted(item); sep = ", "; } oss << "]"; @@ -54,9 +54,7 @@ std::string VecToString(const std::vector &vec, std::string OnlineRecognizerResult::AsJsonString() const { std::ostringstream os; os << "{ "; - os << "\"text\": " - << "\"" << text << "\"" - << ", "; + os << "\"text\": " << std::quoted(text) << ", "; os << "\"tokens\": " << VecToString(tokens) << ", "; os << "\"timestamps\": " << VecToString(timestamps, 2) << ", "; os << "\"ys_probs\": " << VecToString(ys_probs, 6) << ", ";