Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vad patch #1369

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions sherpa-onnx/csrc/silero-vad-model-config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
po->Register(
"silero-vad-max-speech-duration", &max_speech_duration,
"In seconds. If a speech segment is longer than this value, then we "
"increase the threshold to 0.9. After finishing detecting the segment, "
"the threshold value is reset to its original value.");
"cut a segment.");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't remove it.


po->Register(
"silero-vad-window-size", &window_size,
Expand Down Expand Up @@ -102,12 +101,12 @@ bool SileroVadModelConfig::Validate() const {
std::string SileroVadModelConfig::ToString() const {
std::ostringstream os;

os << "SileroVadModelConfig(";
os << "SilerVadModelConfig(";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't change it.

os << "model=\"" << model << "\", ";
os << "threshold=" << threshold << ", ";
os << "min_silence_duration=" << min_silence_duration << ", ";
os << "min_speech_duration=" << min_speech_duration << ", ";
os << "max_speech_duration=" << max_speech_duration << ", ";
os << "max_speech_duration=" << max_speech_duration << ", ";
os << "window_size=" << window_size << ")";

return os.str();
Expand Down
5 changes: 1 addition & 4 deletions sherpa-onnx/csrc/silero-vad-model-config.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ struct SileroVadModelConfig {
// 256, 512, 768 samples for 8000 Hz
int32_t window_size = 512; // in samples

// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,
// the threshold value is reset to its original value.
float max_speech_duration = 20; // in seconds
float max_speech_duration = 20; // in seconds
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't remove the comments.


SileroVadModelConfig() = default;

Expand Down
66 changes: 53 additions & 13 deletions sherpa-onnx/csrc/silero-vad-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/session.h"
#include "silero-vad-model.h"

namespace sherpa_onnx {

Expand All @@ -32,9 +33,13 @@ class SileroVadModel::Impl {
}

min_silence_samples_ =
sample_rate_ * config_.silero_vad.min_silence_duration;
(int32_t)(sample_rate_ * config_.silero_vad.min_silence_duration);

min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
min_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.min_speech_duration);

max_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.max_speech_duration);
Comment on lines 35 to +42
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to make such changes?

}

#if __ANDROID_API__ >= 9
Expand All @@ -54,9 +59,13 @@ class SileroVadModel::Impl {
}

min_silence_samples_ =
sample_rate_ * config_.silero_vad.min_silence_duration;
(int32_t)(sample_rate_ * config_.silero_vad.min_silence_duration);

min_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.min_speech_duration);

min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
max_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.max_speech_duration);
}
#endif

Expand Down Expand Up @@ -155,14 +164,34 @@ class SileroVadModel::Impl {

// Returns the stored minimum speech duration, expressed in samples.
int32_t MinSpeechDurationSamples() const {
  return min_speech_samples_;
}

// Returns the stored maximum speech duration, expressed in samples.
int32_t MaxSpeechDurationSamples() const {
  return max_speech_samples_;
}

float Threshold() { return config_.silero_vad.threshold; }

// Updates the minimum silence length from a duration `s` (presumably in
// seconds, like the configured durations) by scaling it with the sample
// rate. The product is truncated toward zero when narrowed to int32_t.
void SetMinSilenceDuration(float s) {
  min_silence_samples_ = static_cast<int32_t>(sample_rate_ * s);
}

// Updates the minimum speech length from a duration `s` (presumably in
// seconds) by scaling it with the sample rate; the result is truncated
// toward zero when narrowed to int32_t.
void SetMinSpeechDuration(float s) {
  min_speech_samples_ = static_cast<int32_t>(sample_rate_ * s);
}

// Updates the maximum speech length from a duration `s` (presumably in
// seconds) by scaling it with the sample rate; the result is truncated
// toward zero when narrowed to int32_t.
void SetMaxSpeechDuration(float s) {
  max_speech_samples_ = static_cast<int32_t>(sample_rate_ * s);
}

// Overwrites the threshold stored in the silero_vad configuration.
void SetThreshold(float threshold) { config_.silero_vad.threshold = threshold; }

// Runs the model on `n` samples, choosing the V5 or V4 code path based on
// is_v5_, and returns whatever the selected runner returns.
float Run(const float *samples, int32_t n) {
  if (is_v5_) {
    return RunV5(samples, n);
  }

  return RunV4(samples, n);
}

private:
void Init(void *model_data, size_t model_data_length) {
sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
Expand Down Expand Up @@ -335,14 +364,6 @@ class SileroVadModel::Impl {
}
}

// Dispatches to RunV4 unless is_v5_ is set, in which case RunV5 handles
// the `n` input samples; the runner's result is returned unchanged.
float Run(const float *samples, int32_t n) {
  if (!is_v5_) {
    return RunV4(samples, n);
  }

  return RunV5(samples, n);
}

float RunV5(const float *samples, int32_t n) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
Expand Down Expand Up @@ -418,6 +439,7 @@ class SileroVadModel::Impl {
int64_t sample_rate_;
int32_t min_silence_samples_;
int32_t min_speech_samples_;
int32_t max_speech_samples_;

bool triggered_ = false;
int32_t current_sample_ = 0;
Expand Down Expand Up @@ -457,12 +479,30 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const {
return impl_->MinSpeechDurationSamples();
}

// Forwards to the implementation object and returns its maximum speech
// duration in samples.
int32_t SileroVadModel::MaxSpeechDurationSamples() {
  const int32_t num_samples = impl_->MaxSpeechDurationSamples();
  return num_samples;
}

// Forwards to the implementation object and returns its current threshold.
float SileroVadModel::Threshold() {
  return impl_->Threshold();
}

// Forwards the new minimum silence duration to the implementation object.
void SileroVadModel::SetMinSilenceDuration(float s) { impl_->SetMinSilenceDuration(s); }

// Forwards the new minimum speech duration to the implementation object.
void SileroVadModel::SetMinSpeechDuration(float s) { impl_->SetMinSpeechDuration(s); }

// Forwards the new threshold to the implementation object.
void SileroVadModel::SetThreshold(float threshold) { impl_->SetThreshold(threshold); }

// Forwards the new maximum speech duration to the implementation object.
void SileroVadModel::SetMaxSpeechDuration(float s) { impl_->SetMaxSpeechDuration(s); }

// Passes the `n` input samples to the implementation object's Run and
// returns its result unchanged.
float SileroVadModel::Run(const float *samples, int32_t n) {
  const float result = impl_->Run(samples, n);
  return result;
}

} // namespace sherpa_onnx
8 changes: 7 additions & 1 deletion sherpa-onnx/csrc/silero-vad-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class SileroVadModel : public VadModel {
*/
bool IsSpeech(const float *samples, int32_t n) override;

float Run(const float *samples, int32_t n);

// For silero vad V4, it is WindowShift().
// For silero vad V5, it is WindowShift()+64 for 16kHz and
// WindowShift()+32 for 8kHz
Expand All @@ -47,9 +49,13 @@ class SileroVadModel : public VadModel {

int32_t MinSilenceDurationSamples() const override;
int32_t MinSpeechDurationSamples() const override;
int32_t MaxSpeechDurationSamples();
float Threshold();

void SetMinSilenceDuration(float s) override;
void SetThreshold(float threshold) override;
void SetMinSpeechDuration(float s);
void SetMaxSpeechDuration(float s);
void SetThreshold(float threshold) override;

private:
class Impl;
Expand Down
Loading