From c182a14853d0d6bbc3cda36b85c102f3cd1b6998 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 1 Jul 2024 10:37:29 -0400 Subject: [PATCH 1/2] chore: Update speaker_id to uint32_t and add speed parameter for generating audio from text --- data/locale/en-US.ini | 17 +++++++++-------- src/input-thread.cpp | 19 +++++++++++++------ src/sherpa-tts/sherpa-tts.cpp | 33 ++++++++++++++++++++++++++------- src/sherpa-tts/sherpa-tts.h | 5 ++++- src/squawk-source-data.h | 3 ++- src/squawk-source.cpp | 15 ++++++++++----- 6 files changed, 64 insertions(+), 28 deletions(-) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index e219c8f..5f2c1ac 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -1,8 +1,9 @@ -none_no_input=No input -Phonetic_Transcription=Phonetic Transcription -File=File -Text=Text -Generate_Audio=Generate Audio -Speaker_ID=Speaker ID -Model=Model -Delete_Cached_Models=Delete Cached Models +none_no_input="No input" +Phonetic_Transcription="Phonetic Transcription" +File="File" +Text="Text" +Generate_Audio="Generate Audio" +Speaker_ID="Speaker ID" +Model="Model" +Delete_Cached_Models="Delete Cached Models" +Speed="Speed" diff --git a/src/input-thread.cpp b/src/input-thread.cpp index eec19f6..4b5d6f3 100644 --- a/src/input-thread.cpp +++ b/src/input-thread.cpp @@ -18,6 +18,8 @@ void InputThread::run() while (running) { obs_log(LOG_DEBUG, "Input thread checking for changes"); + std::string new_content_for_generation; + // Monitor files for changes if (!file.empty()) { // Check if file has changed @@ -37,9 +39,7 @@ void InputThread::run() } if (fileContents != lastFileValue) { // Invoke speech generation if it has changed - if (speechGenerationCallback) { - speechGenerationCallback(fileContents); - } + new_content_for_generation = fileContents; lastFileValue = fileContents; } } @@ -58,9 +58,7 @@ void InputThread::run() obs_data_release(sourceSettings); if (text && lastOBSTextSourceValue != text) { // Invoke speech generation if it has changed - if (speechGenerationCallback) { - speechGenerationCallback(text); - } + new_content_for_generation = text; lastOBSTextSourceValue = text; } } @@ -68,6 +66,15 @@ void InputThread::run() } } + if (!new_content_for_generation.empty() && speechGenerationCallback) { + std::thread generationThread([this, new_content_for_generation]() { + obs_log(LOG_DEBUG, "Generating speech from input: %s", + new_content_for_generation.c_str()); + speechGenerationCallback(new_content_for_generation); + }); + generationThread.detach(); + } + // Sleep for a certain interval before checking again std::this_thread::sleep_for(std::chrono::milliseconds(interval)); } diff --git a/src/sherpa-tts/sherpa-tts.cpp b/src/sherpa-tts/sherpa-tts.cpp index f666e84..d5e37b8 100644 --- a/src/sherpa-tts/sherpa-tts.cpp +++ b/src/sherpa-tts/sherpa-tts.cpp @@ -6,19 +6,36 @@ #include -void generate_audio_from_text(sherpa_tts_context &ctx, const char *text, int speaker_id) +void generate_audio_from_text(sherpa_tts_context &ctx, const std::string &text, uint32_t speaker_id, + float speed) { - if (ctx.tts == nullptr) { + if (ctx.tts == nullptr || !ctx.initialized || text.empty() || + ctx.audio_callback == nullptr || speed <= 0.0f) { return; } - const SherpaOnnxGeneratedAudio *audio = - SherpaOnnxOfflineTtsGenerate(ctx.tts, text, speaker_id, 1.0); + if (ctx.num_speakers == 0) { + obs_log(LOG_WARNING, "No speakers found in the model. Assuming speaker id 0."); + speaker_id = 0; + } else if (speaker_id < 0 || speaker_id >= ctx.num_speakers) { + obs_log(LOG_WARNING, "Speaker id %d is out of range (0 -> %d), using speaker id 0", + speaker_id, ctx.num_speakers - 1); + speaker_id = 0; + } + + try { + obs_log(LOG_DEBUG, "Generating audio from text: %s, speaker_id: %d, speed: %f", + text.c_str(), speaker_id, speed); + const SherpaOnnxGeneratedAudio *audio = + SherpaOnnxOfflineTtsGenerate(ctx.tts, text.c_str(), speaker_id, speed); - // Call the audio callback function with the generated audio samples - ctx.audio_callback(ctx.callback_data, audio->samples, audio->n, audio->sample_rate); + // Call the audio callback function with the generated audio samples + ctx.audio_callback(ctx.callback_data, audio->samples, audio->n, audio->sample_rate); - SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); + } catch (const std::exception &e) { + obs_log(LOG_ERROR, "Error generating audio from text: %s", e.what()); + } } void init_sherpa_tts_context(sherpa_tts_context &context, @@ -66,6 +83,8 @@ void init_sherpa_tts_context(sherpa_tts_context &context, context.tts = SherpaOnnxCreateOfflineTts(&config); context.audio_callback = audio_callback; context.callback_data = data; + context.num_speakers = std::max(1, SherpaOnnxOfflineTtsNumSpeakers(context.tts)); + context.initialized = true; } void destroy_sherpa_tts_context(sherpa_tts_context &context) diff --git a/src/sherpa-tts/sherpa-tts.h b/src/sherpa-tts/sherpa-tts.h index 13bf945..441b655 100644 --- a/src/sherpa-tts/sherpa-tts.h +++ b/src/sherpa-tts/sherpa-tts.h @@ -10,6 +10,8 @@ struct sherpa_tts_context { std::string model_name; void *callback_data = nullptr; void (*audio_callback)(void *data, const float *samples, int num_samples, int sample_rate); + uint32_t num_speakers = 0; + bool initialized = false; }; void init_sherpa_tts_context(sherpa_tts_context &context, @@ -18,6 +20,7 @@ void init_sherpa_tts_context(sherpa_tts_context &context, void *data); void destroy_sherpa_tts_context(sherpa_tts_context &context); -void generate_audio_from_text(sherpa_tts_context &ctx, const char *text, int speaker_id); +void generate_audio_from_text(sherpa_tts_context &ctx, const std::string &text, uint32_t speaker_id, + float speed); #endif diff --git a/src/squawk-source-data.h b/src/squawk-source-data.h index ca62ca1..a5ef568 100644 --- a/src/squawk-source-data.h +++ b/src/squawk-source-data.h @@ -14,7 +14,8 @@ struct squawk_source_data { sherpa_tts_context tts_context; std::unique_ptr inputThread; - int speaker_id; + uint32_t speaker_id; + float speed; bool phonetic_transcription; squawk_source_data() { context = nullptr; } diff --git a/src/squawk-source.cpp b/src/squawk-source.cpp index 27b9a4e..5a60a4f 100644 --- a/src/squawk-source.cpp +++ b/src/squawk-source.cpp @@ -47,8 +47,8 @@ void *squawk_source_create(obs_data_t *settings, obs_source_t *source) if (squawk_data->phonetic_transcription) { transformed_text = phonetic_transcription(text); } - generate_audio_from_text(squawk_data->tts_context, transformed_text.c_str(), - squawk_data->speaker_id); + generate_audio_from_text(squawk_data->tts_context, transformed_text, + squawk_data->speaker_id, squawk_data->speed); }); squawk_data->inputThread->start(); @@ -70,6 +70,7 @@ void squawk_source_destroy(void *data) void squawk_source_defaults(obs_data_t *settings) { obs_data_set_default_int(settings, "speaker_id", 0); + obs_data_set_default_double(settings, "speed", 1.0); obs_data_set_default_string(settings, "text", "Hello, World!"); obs_data_set_default_string(settings, "model", "vits-coqui-en-vctk"); obs_data_set_default_string(settings, "input_source", "none"); @@ -143,6 +144,9 @@ obs_properties_t *squawk_source_properties(void *data) // add speaker id property obs_properties_add_int(ppts, "speaker_id", MT_("Speaker_ID"), 0, 100, 1); + // add a speed slider between 0.1 and 2.5 + obs_properties_add_float_slider(ppts, "speed", MT_("Speed"), 0.1, 2.5, 0.1); + // add input source selection dropdown property obs_property_t *input_source = obs_properties_add_list( ppts, "input_source", "Input Source", OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); @@ -179,8 +183,8 @@ obs_properties_t *squawk_source_properties(void *data) original_text.c_str(), text.c_str()); } - generate_audio_from_text(squawk_data_->tts_context, text.c_str(), - speaker_id); + generate_audio_from_text(squawk_data_->tts_context, text, speaker_id, + squawk_data_->speed); return true; }); @@ -221,7 +225,8 @@ void squawk_source_update(void *data, obs_data_t *settings) squawk_source_data *squawk_data = (squawk_source_data *)data; - squawk_data->speaker_id = (int)obs_data_get_int(settings, "speaker_id"); + squawk_data->speaker_id = (uint32_t)obs_data_get_int(settings, "speaker_id"); + squawk_data->speed = (float)obs_data_get_double(settings, "speed"); squawk_data->phonetic_transcription = obs_data_get_bool(settings, "phonetic_transcription"); std::string source = obs_data_get_string(settings, "input_source"); From 6201b9c94a8d8d229b223b8c66a1ccda574e4284 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 1 Jul 2024 10:41:33 -0400 Subject: [PATCH 2/2] Fix out-of-range speaker id warning in generate_audio_from_text function --- src/sherpa-tts/sherpa-tts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sherpa-tts/sherpa-tts.cpp b/src/sherpa-tts/sherpa-tts.cpp index d5e37b8..bd408a4 100644 --- a/src/sherpa-tts/sherpa-tts.cpp +++ b/src/sherpa-tts/sherpa-tts.cpp @@ -17,7 +17,7 @@ void generate_audio_from_text(sherpa_tts_context &ctx, const std::string &text, if (ctx.num_speakers == 0) { obs_log(LOG_WARNING, "No speakers found in the model. Assuming speaker id 0."); speaker_id = 0; - } else if (speaker_id < 0 || speaker_id >= ctx.num_speakers) { + } else if (speaker_id >= ctx.num_speakers) { obs_log(LOG_WARNING, "Speaker id %d is out of range (0 -> %d), using speaker id 0", speaker_id, ctx.num_speakers - 1); speaker_id = 0;