-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add vad with non-streaming ASR examples for Dart API (#1180)
- Loading branch information
1 parent
d279c8d
commit 69b6b47
Showing
21 changed files
with
924 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# https://dart.dev/guides/libraries/private-files
# Created by `dart pub`
.dart_tool/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Introduction

This folder contains examples for non-streaming ASR + voice activity detection
with Dart API.

| File | Description|
|------|------------|
|[./bin/paraformer.dart](./bin/paraformer.dart)| Use a Paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)|
|[./bin/sense-voice.dart](./bin/sense-voice.dart)| Use a SenseVoice Ctc model for speech recognition. See [./run-sense-voice-zh.sh](./run-sense-voice-zh.sh) and [./run-sense-voice-en.sh](./run-sense-voice-en.sh)|
|[./bin/telespeech-ctc.dart](./bin/telespeech-ctc.dart)| Use a TeleSpeech CTC model for speech recognition. See [./run-telespeech-ctc.sh](./run-telespeech-ctc.sh)|
|[./bin/whisper.dart](./bin/whisper.dart)| Use a Whisper model for speech recognition. See [./run-whisper.sh](./run-whisper.sh)|
|[./bin/zipformer-transducer.dart](./bin/zipformer-transducer.dart)| Use a Zipformer transducer model for speech recognition. See [./run-zipformer-transducer.sh](./run-zipformer-transducer.sh)|
30 changes: 30 additions & 0 deletions
30
dart-api-examples/vad-with-non-streaming-asr/analysis_options.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# This file configures the static analysis results for your project (errors,
# warnings, and lints).
#
# This enables the 'recommended' set of lints from `package:lints`.
# This set helps identify many issues that may lead to problems when running
# or consuming Dart code, and enforces writing Dart using a single, idiomatic
# style and format.
#
# If you want a smaller set of lints you can change this to specify
# 'package:lints/core.yaml'. These are just the most critical lints
# (the recommended set includes the core lints).
# The core lints are also what is used by pub.dev for scoring packages.

include: package:lints/recommended.yaml

# Uncomment the following section to specify additional rules.

# linter:
#   rules:
#     - camel_case_types

# analyzer:
#   exclude:
#     - path/to/excluded/files/**

# For more information about the core and recommended set of lints, see
# https://dart.dev/go/core-lints

# For additional information about configuring this file, see
# https://dart.dev/guides/language/analysis-options
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../vad/bin/init.dart
123 changes: 123 additions & 0 deletions
123
dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Decodes a single speech segment with [recognizer] and prints the
/// transcript together with the segment's start/end time in seconds.
void _decodeSegment(sherpa_onnx.OfflineRecognizer recognizer,
    Float32List samples, int sampleRate, double startTime) {
  final endTime = startTime + samples.length.toDouble() / sampleRate;

  final stream = recognizer.createStream();
  stream.acceptWaveform(samples: samples, sampleRate: sampleRate);
  recognizer.decode(stream);

  final result = recognizer.getResult(stream);
  stream.free();
  print(
      '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');
}

/// Pops every pending speech segment from [vad] and transcribes it with
/// [recognizer].
void _drainVad(sherpa_onnx.VoiceActivityDetector vad,
    sherpa_onnx.OfflineRecognizer recognizer, int sampleRate) {
  while (!vad.isEmpty()) {
    final segment = vad.front();
    final startTime = segment.start.toDouble() / sampleRate;
    _decodeSegment(recognizer, segment.samples, sampleRate, startTime);
    vad.pop();
  }
}

/// Runs silero-vad plus a Paraformer offline model over `--input-wav` and
/// prints one timestamped transcript line per detected speech segment.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the paraformer model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // Create the voice activity detector.
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // Create the Paraformer recognizer.
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final paraformer = sherpa_onnx.OfflineParaformerModelConfig(
    model: model,
  );

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    paraformer: paraformer,
    tokens: tokens,
    debug: true,
    numThreads: 1,
    modelType: 'paraformer',
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Feed the waveform to the VAD one window at a time. A trailing partial
  // window (fewer than windowSize samples) is dropped by the integer
  // division; flush() below still emits the final open segment.
  final windowSize = vadConfig.sileroVad.windowSize;
  final numIter = waveData.samples.length ~/ windowSize;

  for (int i = 0; i != numIter; ++i) {
    final start = i * windowSize;
    vad.acceptWaveform(Float32List.sublistView(
        waveData.samples, start, start + windowSize));

    if (vad.isDetected()) {
      _drainVad(vad, recognizer, waveData.sampleRate);
    }
  }

  vad.flush();
  _drainVad(vad, recognizer, waveData.sampleRate);

  vad.free();
  recognizer.free();
}
128 changes: 128 additions & 0 deletions
128
dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Decodes a single speech segment with [recognizer] and prints the
/// transcript together with the segment's start/end time in seconds.
void _decodeSegment(sherpa_onnx.OfflineRecognizer recognizer,
    Float32List samples, int sampleRate, double startTime) {
  final endTime = startTime + samples.length.toDouble() / sampleRate;

  final stream = recognizer.createStream();
  stream.acceptWaveform(samples: samples, sampleRate: sampleRate);
  recognizer.decode(stream);

  final result = recognizer.getResult(stream);
  stream.free();
  print(
      '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');
}

/// Pops every pending speech segment from [vad] and transcribes it with
/// [recognizer].
void _drainVad(sherpa_onnx.VoiceActivityDetector vad,
    sherpa_onnx.OfflineRecognizer recognizer, int sampleRate) {
  while (!vad.isEmpty()) {
    final segment = vad.front();
    final startTime = segment.start.toDouble() / sampleRate;
    _decodeSegment(recognizer, segment.samples, sampleRate, startTime);
    vad.pop();
  }
}

/// Runs silero-vad plus a SenseVoice offline model over `--input-wav` and
/// prints one timestamped transcript line per detected speech segment.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the SenseVoice model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('language',
        help: 'auto, zh, en, ja, ko, yue, or leave it empty to use auto',
        defaultsTo: '')
    ..addOption('use-itn',
        help: 'true to use inverse text normalization', defaultsTo: 'false')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // Create the voice activity detector.
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // Create the SenseVoice recognizer.
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;
  final language = res['language'] as String;
  final useItn = (res['use-itn'] as String).toLowerCase() == 'true';

  final senseVoice = sherpa_onnx.OfflineSenseVoiceModelConfig(
      model: model, language: language, useInverseTextNormalization: useItn);

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    senseVoice: senseVoice,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Feed the waveform to the VAD one window at a time. A trailing partial
  // window (fewer than windowSize samples) is dropped by the integer
  // division; flush() below still emits the final open segment.
  final windowSize = vadConfig.sileroVad.windowSize;
  final numIter = waveData.samples.length ~/ windowSize;

  for (int i = 0; i != numIter; ++i) {
    final start = i * windowSize;
    vad.acceptWaveform(Float32List.sublistView(
        waveData.samples, start, start + windowSize));

    if (vad.isDetected()) {
      _drainVad(vad, recognizer, waveData.sampleRate);
    }
  }

  vad.flush();
  _drainVad(vad, recognizer, waveData.sampleRate);

  vad.free();
  recognizer.free();
}
Oops, something went wrong.