Skip to content

Commit

Permalink
Add VAD clear API for better performance (k2-fsa#366)
Browse files Browse the repository at this point in the history
* Add VAD clear API for better performance

* Rename to make naming consistent and remove macro

* Fix linker error

* Fix Vad.kt
  • Loading branch information
yujinqiu authored Oct 16, 2023
1 parent 0682dd1 commit 400e406
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ class MainActivity : AppCompatActivity() {
val samples = FloatArray(ret) { buffer[it] / 32768.0f }

vad.acceptWaveform(samples)
while(!vad.empty()) {vad.pop();}

val isSpeechDetected = vad.isSpeechDetected()
vad.clear()

runOnUiThread {
onVad(isSpeechDetected)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class Vad(
// Returns the result of the native front(ptr) call: an Array<Any> laid out as
// [start: Int, samples: FloatArray]
fun front() = front(ptr)

// Forwards to the native clear(ptr) binding for this detector handle.
fun clear() = clear(ptr)

// Forwards to the native isSpeechDetected(ptr) binding.
fun isSpeechDetected(): Boolean = isSpeechDetected(ptr)

// Forwards to the native reset(ptr) binding.
fun reset() = reset(ptr)
Expand All @@ -64,6 +66,7 @@ class Vad(
// Native (JNI) bindings. Each one takes `ptr`, the native handle held by this
// class, as its first argument.
private external fun acceptWaveform(ptr: Long, samples: FloatArray)
private external fun empty(ptr: Long): Boolean
private external fun pop(ptr: Long)
private external fun clear(ptr: Long)
// Returns the front segment as an untyped array; see the note on front() for
// the element layout.
private external fun front(ptr: Long): Array<Any>
private external fun isSpeechDetected(ptr: Long): Boolean
private external fun reset(ptr: Long)
Expand Down
9 changes: 7 additions & 2 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -493,12 +493,17 @@ int32_t SherpaOnnxVoiceActivityDetectorDetected(
return p->impl->IsSpeechDetected();
}

SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
// Forwards to Pop() on the C++ detector wrapped by `p` (p->impl).
// NOTE(review): `p` is dereferenced without a null check, so callers must pass
// a valid detector.
void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p) {
p->impl->Pop();
}

SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
// Forwards to Clear() on the C++ detector wrapped by `p` (p->impl).
// NOTE(review): `p` is dereferenced without a null check, so callers must pass
// a valid detector.
void SherpaOnnxVoiceActivityDetectorClear(
SherpaOnnxVoiceActivityDetector *p) {
p->impl->Clear();
}

const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
const sherpa_onnx::SpeechSegment &segment = p->impl->Front();

Expand Down
4 changes: 4 additions & 0 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,10 @@ SherpaOnnxVoiceActivityDetectorDetected(SherpaOnnxVoiceActivityDetector *p);
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p);

// Clear the speech segments currently held by the detector `p`.
// `p` must be a valid detector; the implementation dereferences it
// without a null check.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorClear(
SherpaOnnxVoiceActivityDetector *p);

// Return the first speech segment.
// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
// pointer to avoid memory leak.
Expand Down
4 changes: 4 additions & 0 deletions sherpa-onnx/csrc/voice-activity-detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class VoiceActivityDetector::Impl {

// Removes the front element of segments_.
// NOTE(review): std::queue::pop() on an empty queue is undefined behavior;
// presumably callers check Empty() first — confirm.
void Pop() { segments_.pop(); }

// Empties segments_ by swapping it with a default-constructed temporary
// queue; the previous contents are destroyed together with that temporary.
void Clear() { std::queue<SpeechSegment>().swap(segments_); }

// Returns a reference to the front element of segments_.
// NOTE(review): std::queue::front() on an empty queue is undefined behavior;
// presumably callers check Empty() first — confirm.
const SpeechSegment &Front() const { return segments_.front(); }

void Reset() {
Expand Down Expand Up @@ -121,6 +123,8 @@ bool VoiceActivityDetector::Empty() const { return impl_->Empty(); }

// Forwards to Impl::Pop().
void VoiceActivityDetector::Pop() { impl_->Pop(); }

// Forwards to Impl::Clear().
void VoiceActivityDetector::Clear() { impl_->Clear(); }

// Forwards to Impl::Front() and returns the reference it yields.
const SpeechSegment &VoiceActivityDetector::Front() const {
return impl_->Front();
}
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/voice-activity-detector.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class VoiceActivityDetector {
// NOTE(review): presumably feeds `n` floats starting at `samples` to the
// detector — the implementation is not visible here; confirm.
void AcceptWaveform(const float *samples, int32_t n);
bool Empty() const;
// Forwards to the implementation's Pop().
void Pop();
// Forwards to the implementation's Clear().
void Clear();
// Forwards to the implementation's Front() and returns the reference it
// yields.
const SpeechSegment &Front() const;

bool IsSpeechDetected() const;
Expand Down
10 changes: 10 additions & 0 deletions sherpa-onnx/jni/jni.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ class SherpaOnnxVad {

// Forwards to vad_.Pop().
void Pop() { vad_.Pop(); }

// Forwards to vad_.Clear().
void Clear() { vad_.Clear();}

// Forwards to vad_.Front() and returns the reference it yields.
const SpeechSegment &Front() const { return vad_.Front(); }

// Forwards to vad_.IsSpeechDetected().
bool IsSpeechDetected() const { return vad_.IsSpeechDetected(); }
Expand Down Expand Up @@ -556,6 +558,14 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_pop(JNIEnv *env,
model->Pop();
}

// JNI entry point named for the `clear` native method of
// com.k2fsa.sherpa.onnx.Vad. `ptr` is reinterpreted as a pointer to a
// sherpa_onnx::SherpaOnnxVad, on which Clear() is then invoked.
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_clear(JNIEnv *env,
                                                            jobject /*obj*/,
                                                            jlong ptr) {
  reinterpret_cast<sherpa_onnx::SherpaOnnxVad *>(ptr)->Clear();
}

// see
// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
static jobject NewInteger(JNIEnv *env, int32_t value) {
Expand Down
6 changes: 5 additions & 1 deletion swift-api-examples/SherpaOnnx.swift
Original file line number Diff line number Diff line change
Expand Up @@ -551,14 +551,18 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1
}

func isDetected() -> Bool {
func isSpeechDetected() -> Bool {
return SherpaOnnxVoiceActivityDetectorDetected(vad) == 1
}

/// Calls `SherpaOnnxVoiceActivityDetectorPop` on the wrapped detector `vad`.
func pop() {
SherpaOnnxVoiceActivityDetectorPop(vad)
}

/// Calls `SherpaOnnxVoiceActivityDetectorClear` on the wrapped detector `vad`.
func clear() {
SherpaOnnxVoiceActivityDetectorClear(vad)
}

func front() -> SherpaOnnxSpeechSegmentWrapper {
let p: UnsafePointer<SherpaOnnxSpeechSegment>? = SherpaOnnxVoiceActivityDetectorFront(vad)
return SherpaOnnxSpeechSegmentWrapper(p: p)
Expand Down
35 changes: 17 additions & 18 deletions swift-api-examples/generate-subtitles.swift
Original file line number Diff line number Diff line change
Expand Up @@ -174,32 +174,31 @@ func run() {

var segments: [SpeechSegment] = []

while array.count > windowSize {
// todo(fangjun): avoid extra copies here
vad.acceptWaveform(samples: [Float](array[0..<windowSize]))
array = [Float](array[windowSize..<array.count])

while !vad.isEmpty() {
let s = vad.front()
vad.pop()
let result = recognizer.decode(samples: s.samples)
for offset in stride(from: 0, to: array.count, by: windowSize) {
let end = min(offset + windowSize, array.count)
vad.acceptWaveform(samples: [Float](array[offset ..< end]))
}

segments.append(
SpeechSegment(
start: Float(s.start) / Float(sampleRate),
duration: Float(s.samples.count) / Float(sampleRate),
text: result.text))
var index: Int = 0
while !vad.isEmpty() {
let s = vad.front()
vad.pop()
let result = recognizer.decode(samples: s.samples)

print(segments.last!)
segments.append(
SpeechSegment(
start: Float(s.start) / Float(sampleRate),
duration: Float(s.samples.count) / Float(sampleRate),
text: result.text))

}
print(segments.last!)
}

let srt = zip(segments.indices, segments).map { (index, element) in
let srt: String = zip(segments.indices, segments).map { (index, element) in
return "\(index+1)\n\(element)"
}.joined(separator: "\n\n")

let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
let srtFilename: String = filePath.stringByDeletingPathExtension + ".srt"
do {
try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
} catch {
Expand Down

0 comments on commit 400e406

Please sign in to comment.