Skip to content

Commit

Permalink
Add Go API for TTS
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Oct 20, 2023
1 parent 3ba9a49 commit df94b10
Show file tree
Hide file tree
Showing 14 changed files with 280 additions and 6 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/test-go.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,42 @@ jobs:
go mod tidy
go build
- name: Test non-streaming TTS (macOS)
shell: bash
run: |
mkdir tts-waves
cd scripts/go/_internal/non-streaming-tts/
ls -lh
go mod tidy
cat go.mod
go build
ls -lh
git lfs install
echo "Test vits-ljs"
git clone https://huggingface.co/csukuangfj/vits-ljs
./run-vits-ljs.sh
rm -rf vits-ljs
echo "Test vits-vctk"
git clone https://huggingface.co/csukuangfj/vits-vctk
./run-vits-vctk.sh
rm -rf vits-vctk
echo "Test vits-zh-aishell3"
git clone https://huggingface.co/csukuangfj/vits-zh-aishell3
./run-vits-zh-aishell3.sh
rm -rf vits-zh-aishell3
cp *.wav ../../../../tts-waves/
- uses: actions/upload-artifact@v3
with:
name: tts-waves
path: tts-waves

- name: Test non-streaming decoding files (macOS)
shell: bash
run: |
Expand Down
2 changes: 1 addition & 1 deletion c-api-examples/offline-tts-c-api.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ int32_t main(int32_t argc, char *argv[]) {
const SherpaOnnxGeneratedAudio *audio =
SherpaOnnxOfflineTtsGenerate(tts, text, sid);

SherpaOnnxDestroyOfflineWriteWave(audio, filename);
SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename);

SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
SherpaOnnxDestroyOfflineTts(tts);
Expand Down
3 changes: 3 additions & 0 deletions go-api-examples/non-streaming-tts/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module non-streaming-tts

go 1.12
57 changes: 57 additions & 0 deletions go-api-examples/non-streaming-tts/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package main

import (
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
flag "github.com/spf13/pflag"
"log"
)

// main parses the command-line flags into an offline TTS configuration,
// synthesizes the single positional argument (the text to speak) with the
// requested speaker ID, and saves the audio to --output-filename.
func main() {
	log.SetFlags(log.LstdFlags | log.Lmicroseconds)

	config := sherpa.OfflineTtsConfig{}
	sid := 0
	filename := "./generated.wav"

	flag.StringVar(&config.Model.Vits.Model, "vits-model", "", "Path to the vits ONNX model")
	flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt")
	flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt")

	flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS")
	flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS")
	flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower")

	flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing")
	flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
	flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")

	flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models")
	flag.StringVar(&filename, "output-filename", "./generated.wav", "Filename to save the generated audio")

	flag.Parse()

	// Exactly one positional argument is expected: the text to synthesize.
	if len(flag.Args()) != 1 {
		log.Fatalf("Please provide the text to generate audios")
	}

	text := flag.Arg(0)

	log.Println("Input text:", text)
	log.Println("Speaker ID:", sid)
	log.Println("Output filename:", filename)

	log.Println("Initializing model (may take several seconds)")
	tts := sherpa.NewOfflineTts(&config)
	log.Println("Model created!")
	defer sherpa.DeleteOfflineTts(tts)

	log.Println("Start generating!")

	audio := tts.Generate(text, sid)
	ok := audio.Save(filename)
	if ok != 1 {
		// The format verb is required: without it Fatalf appends the
		// filename as "%!(EXTRA string=...)" instead of printing it cleanly.
		log.Fatalf("Failed to write %s", filename)
	}

	log.Println("Done!")
}
14 changes: 14 additions & 0 deletions go-api-examples/non-streaming-tts/run-vits-ljs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# please refer to
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#ljspeech-english-single-speaker
# to download the model before you run this script

# Synthesize one sentence with the model files under ./vits-ljs
# (speaker ID 0, debug output enabled) and save it to ./vits-ljs.wav.
./non-streaming-tts \
  --vits-model=./vits-ljs/vits-ljs.onnx \
  --vits-lexicon=./vits-ljs/lexicon.txt \
  --vits-tokens=./vits-ljs/tokens.txt \
  --sid=0 \
  --debug=1 \
  --output-filename=./vits-ljs.wav \
  "Liliana, the most beautiful and lovely assistant of our team!"
16 changes: 16 additions & 0 deletions go-api-examples/non-streaming-tts/run-vits-vctk.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

# please refer to
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers
# to download the model before you run this script

# Generate the same sentence once per speaker ID. The loop variable must be
# passed to --sid; otherwise every kennedy-$sid.wav is spoken by speaker 0.
for sid in 0 10 108; do
  ./non-streaming-tts \
    --vits-model=./vits-vctk/vits-vctk.onnx \
    --vits-lexicon=./vits-vctk/lexicon.txt \
    --vits-tokens=./vits-vctk/tokens.txt \
    --sid=$sid \
    --debug=1 \
    --output-filename=./kennedy-$sid.wav \
    'Ask not what your country can do for you; ask what you can do for your country.'
done
16 changes: 16 additions & 0 deletions go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

# please refer to
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#aishell3-chinese-multi-speaker-174-speakers
# to download the model before you run this script

# Generate the same sentence once per speaker ID. The loop variable must be
# passed to --sid; otherwise every liliana-$sid.wav is spoken by speaker 10.
for sid in 10 33 99; do
  ./non-streaming-tts \
    --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
    --vits-lexicon=./vits-zh-aishell3/lexicon.txt \
    --vits-tokens=./vits-zh-aishell3/tokens.txt \
    --sid=$sid \
    --debug=1 \
    --output-filename=./liliana-$sid.wav \
    "林美丽最美丽、最漂亮、最可爱!"
done
5 changes: 5 additions & 0 deletions scripts/go/_internal/non-streaming-tts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
*.wav
vits-ljs
vits-vctk
vits-zh-aishell3
non-streaming-tts
10 changes: 10 additions & 0 deletions scripts/go/_internal/non-streaming-tts/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module non-streaming-tts

go 1.12

replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../

require (
github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx v0.0.0-00010101000000-000000000000
github.com/spf13/pflag v1.0.5
)
2 changes: 2 additions & 0 deletions scripts/go/_internal/non-streaming-tts/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
1 change: 1 addition & 0 deletions scripts/go/_internal/non-streaming-tts/main.go
113 changes: 113 additions & 0 deletions scripts/go/sherpa_onnx.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ Usage examples:
Please see
https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files
4. Convert text to speech using a non-streaming model
Please see
https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts
[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
[onnxruntime]: https://github.com/microsoft/onnxruntime
[Next-gen Kaldi]: https://github.com/k2-fsa/
Expand Down Expand Up @@ -488,3 +493,111 @@ func (s *OfflineStream) GetResult() *OfflineRecognizerResult {

return result
}

// OfflineTtsVitsModelConfig is the configuration of a VITS model for
// offline/non-streaming text-to-speech (TTS).
//
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
// to download pre-trained models
type OfflineTtsVitsModelConfig struct {
	Model string // Path to the VITS onnx model
	Lexicon string // Path to lexicon.txt
	Tokens string // Path to tokens.txt
	NoiseScale float32 // noise scale for vits models. Please use 0.667 in general
	NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general
	LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
}

// OfflineTtsModelConfig groups the VITS model configuration with the
// runtime options (thread count, debug flag, execution provider) that
// NewOfflineTts copies into the C configuration.
type OfflineTtsModelConfig struct {
	// Configuration of the VITS model
	Vits OfflineTtsVitsModelConfig

	// Number of threads to use for neural network computation
	NumThreads int

	// 1 to print model meta information while loading
	Debug int

	// Optional. Valid values: cpu, cuda, coreml
	Provider string
}

// OfflineTtsConfig is the top-level configuration accepted by
// NewOfflineTts.
type OfflineTtsConfig struct {
	// Model and runtime configuration
	Model OfflineTtsModelConfig
}

// GeneratedAudio holds the audio returned by OfflineTts.Generate as a
// Go-owned copy of the samples together with their sample rate.
type GeneratedAudio struct {
	// Normalized samples in the range [-1, 1]
	Samples []float32

	// Sample rate of Samples, as reported by the C API
	SampleRate int
}

// OfflineTts is the offline (non-streaming) text-to-speech engine.
// It wraps a pointer from C; release it with DeleteOfflineTts.
type OfflineTts struct {
	// Handle returned by C.SherpaOnnxCreateOfflineTts; set to nil by DeleteOfflineTts
	impl *C.struct_SherpaOnnxOfflineTts
}

// DeleteOfflineTts releases the C object wrapped by tts and clears the
// stored pointer so the wrapper no longer refers to freed memory.
func DeleteOfflineTts(tts *OfflineTts) {
	handle := tts.impl
	tts.impl = nil
	C.SherpaOnnxDestroyOfflineTts(handle)
}

// NewOfflineTts creates an offline TTS engine from config.
//
// The string fields of config are copied into temporary C strings that are
// freed when this function returns; the numeric fields are converted to
// their C counterparts.
//
// The caller must invoke [DeleteOfflineTts]() on the returned value to
// avoid a memory leak.
func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
	vits := &config.Model.Vits

	// Allocate the temporary C strings first; each is released on return.
	modelPath := C.CString(vits.Model)
	defer C.free(unsafe.Pointer(modelPath))

	lexiconPath := C.CString(vits.Lexicon)
	defer C.free(unsafe.Pointer(lexiconPath))

	tokensPath := C.CString(vits.Tokens)
	defer C.free(unsafe.Pointer(tokensPath))

	provider := C.CString(config.Model.Provider)
	defer C.free(unsafe.Pointer(provider))

	var cfg C.struct_SherpaOnnxOfflineTtsConfig

	cfg.model.vits.model = modelPath
	cfg.model.vits.lexicon = lexiconPath
	cfg.model.vits.tokens = tokensPath
	cfg.model.vits.noise_scale = C.float(vits.NoiseScale)
	cfg.model.vits.noise_scale_w = C.float(vits.NoiseScaleW)
	cfg.model.vits.length_scale = C.float(vits.LengthScale)

	cfg.model.num_threads = C.int(config.Model.NumThreads)
	cfg.model.debug = C.int(config.Model.Debug)
	cfg.model.provider = provider

	return &OfflineTts{impl: C.SherpaOnnxCreateOfflineTts(&cfg)}
}

// Generate synthesizes text with speaker ID sid and returns the audio.
//
// The samples are copied out of the C result into a Go slice, after which
// the C result is destroyed, so the returned value needs no manual cleanup.
func (tts *OfflineTts) Generate(text string, sid int) *GeneratedAudio {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	cAudio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, cText, C.int(sid))

	sampleRate := int(cAudio.sample_rate)
	count := int(cAudio.n)

	result := &GeneratedAudio{
		Samples:    make([]float32, count),
		SampleRate: sampleRate,
	}

	// View the C buffer as a Go slice of exactly count elements without
	// copying, then convert element by element into the Go-owned slice.
	view := (*[1 << 28]C.float)(unsafe.Pointer(cAudio.samples))[:count:count]
	for i := range view {
		result.Samples[i] = float32(view[i])
	}

	C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(cAudio)

	return result
}

// Save writes the audio samples to filename through
// C.SherpaOnnxWriteWave, passing the samples, their count and the sample
// rate.
//
// It returns the status reported by the C function, which the example
// program treats as 1 for success. If audio holds no samples there is
// nothing to write and Save returns 0 instead of panicking on the empty
// slice.
func (audio *GeneratedAudio) Save(filename string) int {
	// Taking &audio.Samples[0] on an empty slice would panic with an
	// index-out-of-range error, so report failure up front.
	if len(audio.Samples) == 0 {
		return 0
	}

	s := C.CString(filename)
	defer C.free(unsafe.Pointer(s))

	ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))

	return ok
}
6 changes: 3 additions & 3 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
}
}

int32_t SherpaOnnxDestroyOfflineWriteWave(const SherpaOnnxGeneratedAudio *p,
const char *filename) {
return sherpa_onnx::WriteWave(filename, p->sample_rate, p->samples, p->n);
int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
int32_t sample_rate, const char *filename) {
return sherpa_onnx::WriteWave(filename, sample_rate, samples, n);
}
5 changes: 3 additions & 2 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -648,8 +648,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
// The saved wave file contains a single channel and has 16-bit samples.
//
// Return 1 if the write succeeded; return 0 on failure.
SHERPA_ONNX_API int32_t SherpaOnnxDestroyOfflineWriteWave(
const SherpaOnnxGeneratedAudio *p, const char *filename);
SHERPA_ONNX_API int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
int32_t sample_rate,
const char *filename);

#if defined(__GNUC__)
#pragma GCC diagnostic pop
Expand Down

0 comments on commit df94b10

Please sign in to comment.