Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add vad clear api for better performance #366

Merged
merged 4 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ class MainActivity : AppCompatActivity() {
val samples = FloatArray(ret) { buffer[it] / 32768.0f }

vad.acceptWaveform(samples)
while(!vad.empty()) {vad.pop();}

val isSpeechDetected = vad.isSpeechDetected()
vad.clear()

runOnUiThread {
onVad(isSpeechDetected)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class Vad(
// [start: Int, samples: FloatArray]
fun front() = front(ptr)

// Discards all queued speech segments by forwarding to the native clear(ptr).
fun clear() = clear(ptr)

fun isSpeechDetected(): Boolean = isSpeechDetected(ptr)

fun reset() = reset(ptr)
Expand All @@ -64,6 +66,7 @@ class Vad(
private external fun acceptWaveform(ptr: Long, samples: FloatArray)
private external fun empty(ptr: Long): Boolean
private external fun pop(ptr: Long)
private external fun clear(ptr: Long)
private external fun front(ptr: Long): Array<Any>
private external fun isSpeechDetected(ptr: Long): Boolean
private external fun reset(ptr: Long)
Expand Down
9 changes: 7 additions & 2 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -493,12 +493,17 @@ int32_t SherpaOnnxVoiceActivityDetectorDetected(
return p->impl->IsSpeechDetected();
}

SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p) {
p->impl->Pop();
}

SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
// Discards every speech segment currently queued in the detector `p` by
// forwarding to the Clear() of its underlying implementation object.
void SherpaOnnxVoiceActivityDetectorClear(
SherpaOnnxVoiceActivityDetector *p) {
p->impl->Clear();
}

const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
const sherpa_onnx::SpeechSegment &segment = p->impl->Front();

Expand Down
4 changes: 4 additions & 0 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,10 @@ SherpaOnnxVoiceActivityDetectorDetected(SherpaOnnxVoiceActivityDetector *p);
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p);

// Discard all speech segments currently queued in the detector.
// Only the segment queue is emptied; no segment is returned to the caller.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorClear(
SherpaOnnxVoiceActivityDetector *p);

// Return the first speech segment.
// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
// pointer to avoid memory leak.
Expand Down
4 changes: 4 additions & 0 deletions sherpa-onnx/csrc/voice-activity-detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class VoiceActivityDetector::Impl {

void Pop() { segments_.pop(); }

// Empties the segment queue in one step: segments_ is swapped with a freshly
// constructed (empty) temporary queue, and the previous contents are destroyed
// together with that temporary at the end of the statement.
void Clear() { std::queue<SpeechSegment>().swap(segments_); }

const SpeechSegment &Front() const { return segments_.front(); }

void Reset() {
Expand Down Expand Up @@ -121,6 +123,8 @@ bool VoiceActivityDetector::Empty() const { return impl_->Empty(); }

void VoiceActivityDetector::Pop() { impl_->Pop(); }

// Removes all queued speech segments; delegates to the implementation's Clear().
void VoiceActivityDetector::Clear() { impl_->Clear(); }

const SpeechSegment &VoiceActivityDetector::Front() const {
return impl_->Front();
}
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/voice-activity-detector.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class VoiceActivityDetector {
void AcceptWaveform(const float *samples, int32_t n);
bool Empty() const;
void Pop();
void Clear();
const SpeechSegment &Front() const;

bool IsSpeechDetected() const;
Expand Down
10 changes: 10 additions & 0 deletions sherpa-onnx/jni/jni.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ class SherpaOnnxVad {

void Pop() { vad_.Pop(); }

// Drops all speech segments queued in the wrapped detector.
void Clear() {
  vad_.Clear();
}

const SpeechSegment &Front() const { return vad_.Front(); }

bool IsSpeechDetected() const { return vad_.IsSpeechDetected(); }
Expand Down Expand Up @@ -556,6 +558,14 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_pop(JNIEnv *env,
model->Pop();
}

// JNI entry point backing the Kotlin Vad.clear(): `ptr` is the native
// SherpaOnnxVad handle, and all of its queued speech segments are dropped.
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_clear(JNIEnv *env,
                                                            jobject /*obj*/,
                                                            jlong ptr) {
  auto *vad = reinterpret_cast<sherpa_onnx::SherpaOnnxVad *>(ptr);
  vad->Clear();
}

// see
// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
static jobject NewInteger(JNIEnv *env, int32_t value) {
Expand Down
6 changes: 5 additions & 1 deletion swift-api-examples/SherpaOnnx.swift
Original file line number Diff line number Diff line change
Expand Up @@ -551,14 +551,18 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1
}

func isDetected() -> Bool {
func isSpeechDetected() -> Bool {
return SherpaOnnxVoiceActivityDetectorDetected(vad) == 1
}

/// Removes the front speech segment from the detector's queue.
func pop() {
SherpaOnnxVoiceActivityDetectorPop(vad)
}

/// Discards all speech segments currently queued in the detector.
func clear() {
SherpaOnnxVoiceActivityDetectorClear(vad)
}

func front() -> SherpaOnnxSpeechSegmentWrapper {
let p: UnsafePointer<SherpaOnnxSpeechSegment>? = SherpaOnnxVoiceActivityDetectorFront(vad)
return SherpaOnnxSpeechSegmentWrapper(p: p)
Expand Down
35 changes: 17 additions & 18 deletions swift-api-examples/generate-subtitles.swift
Original file line number Diff line number Diff line change
Expand Up @@ -174,32 +174,31 @@ func run() {

var segments: [SpeechSegment] = []

while array.count > windowSize {
// todo(fangjun): avoid extra copies here
vad.acceptWaveform(samples: [Float](array[0..<windowSize]))
array = [Float](array[windowSize..<array.count])

while !vad.isEmpty() {
let s = vad.front()
vad.pop()
let result = recognizer.decode(samples: s.samples)
for offset in stride(from: 0, to: array.count, by: windowSize) {
let end = min(offset + windowSize, array.count)
vad.acceptWaveform(samples: [Float](array[offset ..< end]))
}

segments.append(
SpeechSegment(
start: Float(s.start) / Float(sampleRate),
duration: Float(s.samples.count) / Float(sampleRate),
text: result.text))
var index: Int = 0
while !vad.isEmpty() {
let s = vad.front()
vad.pop()
let result = recognizer.decode(samples: s.samples)

print(segments.last!)
segments.append(
SpeechSegment(
start: Float(s.start) / Float(sampleRate),
duration: Float(s.samples.count) / Float(sampleRate),
text: result.text))

}
print(segments.last!)
}

let srt = zip(segments.indices, segments).map { (index, element) in
let srt: String = zip(segments.indices, segments).map { (index, element) in
return "\(index+1)\n\(element)"
}.joined(separator: "\n\n")

let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
let srtFilename: String = filePath.stringByDeletingPathExtension + ".srt"
do {
try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
} catch {
Expand Down
Loading