-
Notifications
You must be signed in to change notification settings - Fork 508
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Java and Kotlin API for sense voice (#1164)
- Loading branch information
1 parent
ac8223b
commit dd300b1
Showing
16 changed files
with
601 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
// Copyright 2024 Xiaomi Corporation | ||
|
||
// This file shows how to use an offline SenseVoice model, | ||
// i.e., non-streaming SenseVoice model, | ||
// to decode files. | ||
import com.k2fsa.sherpa.onnx.*; | ||
|
||
public class NonStreamingDecodeFileSenseVoice { | ||
public static void main(String[] args) { | ||
// please refer to | ||
// https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html | ||
// to download model files | ||
String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; | ||
String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; | ||
|
||
String waveFilename = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav"; | ||
|
||
WaveReader reader = new WaveReader(waveFilename); | ||
|
||
OfflineSenseVoiceModelConfig senseVoice = | ||
OfflineSenseVoiceModelConfig.builder().setModel(model).build(); | ||
|
||
OfflineModelConfig modelConfig = | ||
OfflineModelConfig.builder() | ||
.setSenseVoice(senseVoice) | ||
.setTokens(tokens) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.build(); | ||
|
||
OfflineRecognizerConfig config = | ||
OfflineRecognizerConfig.builder() | ||
.setOfflineModelConfig(modelConfig) | ||
.setDecodingMethod("greedy_search") | ||
.build(); | ||
|
||
OfflineRecognizer recognizer = new OfflineRecognizer(config); | ||
OfflineStream stream = recognizer.createStream(); | ||
stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); | ||
|
||
recognizer.decode(stream); | ||
|
||
String text = recognizer.getResult(stream).getText(); | ||
|
||
System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); | ||
|
||
stream.release(); | ||
recognizer.release(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
142 changes: 142 additions & 0 deletions
142
java-api-examples/VadFromMicWithNonStreamingSenseVoice.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
// Copyright 2024 Xiaomi Corporation | ||
|
||
// This file shows how to use a silero_vad model with a non-streaming | ||
// SenseVoice model for speech recognition. | ||
|
||
import com.k2fsa.sherpa.onnx.*; | ||
import javax.sound.sampled.*; | ||
|
||
public class VadFromMicWithNonStreamingSenseVoice { | ||
private static final int sampleRate = 16000; | ||
private static final int windowSize = 512; | ||
|
||
public static Vad createVad() { | ||
// please download ./silero_vad.onnx from | ||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
String model = "./silero_vad.onnx"; | ||
SileroVadModelConfig sileroVad = | ||
SileroVadModelConfig.builder() | ||
.setModel(model) | ||
.setThreshold(0.5f) | ||
.setMinSilenceDuration(0.25f) | ||
.setMinSpeechDuration(0.5f) | ||
.setWindowSize(windowSize) | ||
.build(); | ||
|
||
VadModelConfig config = | ||
VadModelConfig.builder() | ||
.setSileroVadModelConfig(sileroVad) | ||
.setSampleRate(sampleRate) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.setProvider("cpu") | ||
.build(); | ||
|
||
return new Vad(config); | ||
} | ||
|
||
public static OfflineRecognizer createOfflineRecognizer() { | ||
// please refer to | ||
// https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html | ||
// to download model files | ||
String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; | ||
String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; | ||
|
||
OfflineSenseVoiceModelConfig senseVoice = | ||
OfflineSenseVoiceModelConfig.builder().setModel(model).build(); | ||
|
||
OfflineModelConfig modelConfig = | ||
OfflineModelConfig.builder() | ||
.setSenseVoice(senseVoice) | ||
.setTokens(tokens) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.build(); | ||
|
||
OfflineRecognizerConfig config = | ||
OfflineRecognizerConfig.builder() | ||
.setOfflineModelConfig(modelConfig) | ||
.setDecodingMethod("greedy_search") | ||
.build(); | ||
|
||
return new OfflineRecognizer(config); | ||
} | ||
|
||
public static void main(String[] args) { | ||
Vad vad = createVad(); | ||
OfflineRecognizer recognizer = createOfflineRecognizer(); | ||
|
||
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html | ||
// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian | ||
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false); | ||
|
||
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int- | ||
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); | ||
TargetDataLine targetDataLine; | ||
try { | ||
targetDataLine = (TargetDataLine) AudioSystem.getLine(info); | ||
targetDataLine.open(format); | ||
targetDataLine.start(); | ||
} catch (LineUnavailableException e) { | ||
System.out.println("Failed to open target data line: " + e.getMessage()); | ||
vad.release(); | ||
recognizer.release(); | ||
return; | ||
} | ||
|
||
boolean printed = false; | ||
byte[] buffer = new byte[windowSize * 2]; | ||
float[] samples = new float[windowSize]; | ||
|
||
System.out.println("Started. Please speak"); | ||
boolean running = true; | ||
while (targetDataLine.isOpen() && running) { | ||
int n = targetDataLine.read(buffer, 0, buffer.length); | ||
if (n <= 0) { | ||
System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length); | ||
continue; | ||
} | ||
for (int i = 0; i != windowSize; ++i) { | ||
short low = buffer[2 * i]; | ||
short high = buffer[2 * i + 1]; | ||
int s = (high << 8) + low; | ||
samples[i] = (float) s / 32768; | ||
} | ||
|
||
vad.acceptWaveform(samples); | ||
if (vad.isSpeechDetected() && !printed) { | ||
System.out.println("Detected speech"); | ||
printed = true; | ||
} | ||
|
||
if (!vad.isSpeechDetected()) { | ||
printed = false; | ||
} | ||
|
||
while (!vad.empty()) { | ||
SpeechSegment segment = vad.front(); | ||
float startTime = segment.getStart() / (float) sampleRate; | ||
float duration = segment.getSamples().length / (float) sampleRate; | ||
|
||
OfflineStream stream = recognizer.createStream(); | ||
stream.acceptWaveform(segment.getSamples(), sampleRate); | ||
recognizer.decode(stream); | ||
String text = recognizer.getResult(stream).getText(); | ||
stream.release(); | ||
|
||
if (!text.isEmpty()) { | ||
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text); | ||
} | ||
|
||
if (text.contains("退出程序")) { | ||
running = false; | ||
} | ||
|
||
vad.pop(); | ||
} | ||
} | ||
|
||
vad.release(); | ||
recognizer.release(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
// Copyright 2024 Xiaomi Corporation | ||
|
||
// This file shows how to use a silero_vad model with a non-streaming SenseVoiceModel | ||
// for speech recognition. | ||
|
||
import com.k2fsa.sherpa.onnx.*; | ||
import java.util.Arrays; | ||
|
||
public class VadNonStreamingSenseVoice { | ||
public static Vad createVad() { | ||
// please download ./silero_vad.onnx from | ||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
String model = "./silero_vad.onnx"; | ||
SileroVadModelConfig sileroVad = | ||
SileroVadModelConfig.builder() | ||
.setModel(model) | ||
.setThreshold(0.5f) | ||
.setMinSilenceDuration(0.25f) | ||
.setMinSpeechDuration(0.5f) | ||
.setWindowSize(512) | ||
.build(); | ||
|
||
VadModelConfig config = | ||
VadModelConfig.builder() | ||
.setSileroVadModelConfig(sileroVad) | ||
.setSampleRate(16000) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.setProvider("cpu") | ||
.build(); | ||
|
||
return new Vad(config); | ||
} | ||
|
||
public static OfflineRecognizer createOfflineRecognizer() { | ||
// please refer to | ||
// https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html | ||
// to download model files | ||
String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; | ||
String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; | ||
|
||
OfflineSenseVoiceModelConfig senseVoice = | ||
OfflineSenseVoiceModelConfig.builder().setModel(model).build(); | ||
|
||
OfflineModelConfig modelConfig = | ||
OfflineModelConfig.builder() | ||
.setSenseVoice(senseVoice) | ||
.setTokens(tokens) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.build(); | ||
|
||
OfflineRecognizerConfig config = | ||
OfflineRecognizerConfig.builder() | ||
.setOfflineModelConfig(modelConfig) | ||
.setDecodingMethod("greedy_search") | ||
.build(); | ||
|
||
return new OfflineRecognizer(config); | ||
} | ||
|
||
public static void main(String[] args) { | ||
|
||
Vad vad = createVad(); | ||
OfflineRecognizer recognizer = createOfflineRecognizer(); | ||
|
||
// You can download the test file from | ||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
String testWaveFilename = "./lei-jun-test.wav"; | ||
WaveReader reader = new WaveReader(testWaveFilename); | ||
|
||
int numSamples = reader.getSamples().length; | ||
int numIter = numSamples / 512; | ||
|
||
for (int i = 0; i != numIter; ++i) { | ||
int start = i * 512; | ||
int end = start + 512; | ||
float[] samples = Arrays.copyOfRange(reader.getSamples(), start, end); | ||
vad.acceptWaveform(samples); | ||
if (vad.isSpeechDetected()) { | ||
while (!vad.empty()) { | ||
SpeechSegment segment = vad.front(); | ||
float startTime = segment.getStart() / 16000.0f; | ||
float duration = segment.getSamples().length / 16000.0f; | ||
|
||
OfflineStream stream = recognizer.createStream(); | ||
stream.acceptWaveform(segment.getSamples(), 16000); | ||
recognizer.decode(stream); | ||
String text = recognizer.getResult(stream).getText(); | ||
stream.release(); | ||
|
||
if (!text.isEmpty()) { | ||
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text); | ||
} | ||
|
||
vad.pop(); | ||
} | ||
} | ||
} | ||
|
||
vad.flush(); | ||
while (!vad.empty()) { | ||
SpeechSegment segment = vad.front(); | ||
float startTime = segment.getStart() / 16000.0f; | ||
float duration = segment.getSamples().length / 16000.0f; | ||
|
||
OfflineStream stream = recognizer.createStream(); | ||
stream.acceptWaveform(segment.getSamples(), 16000); | ||
recognizer.decode(stream); | ||
String text = recognizer.getResult(stream).getText(); | ||
stream.release(); | ||
|
||
if (!text.isEmpty()) { | ||
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text); | ||
} | ||
|
||
vad.pop(); | ||
} | ||
|
||
vad.release(); | ||
recognizer.release(); | ||
} | ||
} |
Oops, something went wrong.