-
Notifications
You must be signed in to change notification settings - Fork 508
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Text to speech API for Object Pascal. (#1273)
- Loading branch information
1 parent
e34a1a2
commit 5a2aa11
Showing
14 changed files
with
905 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
link*.res |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
!run-*.sh | ||
piper | ||
piper-playback | ||
link*.res |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Introduction | ||
|
||
This directory contains examples for how to use the TTS (text to speech) APIs. | ||
|
||
|File| Description| | ||
|---------|------------| | ||
|[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.| | ||
|[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the generated audio while the rest is still being generated. | | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
{ Copyright (c) 2024 Xiaomi Corporation } | ||
program piper; | ||
{ | ||
This file shows how to use the text to speech API of sherpa-onnx | ||
with Piper models. | ||
It generates speech from text and saves it to a wave file. | ||
Note that it also plays the audio back while it is still being generated. | ||
} | ||
|
||
{$mode objfpc} | ||
|
||
uses | ||
{$ifdef unix} | ||
cthreads, | ||
{$endif} | ||
SysUtils, | ||
dos, | ||
ctypes, | ||
portaudio, | ||
sherpa_onnx; | ||
|
||
var | ||
CriticalSection: TRTLCriticalSection; | ||
|
||
Tts: TSherpaOnnxOfflineTts; | ||
Audio: TSherpaOnnxGeneratedAudio; | ||
Resampler: TSherpaOnnxLinearResampler; | ||
|
||
Text: AnsiString; | ||
Speed: Single = 1.0; {Use a larger value to speak faster} | ||
SpeakerId: Integer = 0; | ||
Buffer: TSherpaOnnxCircularBuffer; | ||
FinishedGeneration: Boolean = False; | ||
FinishedPlaying: Boolean = False; | ||
|
||
Version: String; | ||
EnvStr: String; | ||
Status: Integer; | ||
NumDevices: Integer; | ||
DeviceIndex: Integer; | ||
DeviceInfo: PPaDeviceInfo; | ||
|
||
{ If you get EDivByZero: Division by zero error, please change the sample rate | ||
to one supported by your audio output device (speaker). | ||
} | ||
DeviceSampleRate: Integer = 48000; | ||
I: Integer; | ||
Param: TPaStreamParameters; | ||
Stream: PPaStream; | ||
Wave: TSherpaOnnxWave; | ||
|
||
{ Callback handed to Tts.Generate. Each call receives N samples in Samples
  and appends them to the shared playback Buffer, resampling first when a
  Resampler has been created.

  Samples: pointer to the samples delivered by the generator.
  N:       number of samples in Samples.
  Arg:     user pointer; not used here.

  Returns 1 to continue generating (0 would stop generating). }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;

  { Buffer is also read by PlayCallback, so guard every access. }
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      { NOTE(review): the last argument is presumably a "flush" flag - confirm
        against the resampler API. }
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
|
||
{ PortAudio stream callback. Copies up to frameCount samples from the shared
  Buffer into output; whatever cannot be supplied is filled by extending the
  local array with SetLength.

  input, timeInfo, statusFlags and userData are not used.

  Returns paContinue while the buffer still holds samples or generation has
  not finished. Otherwise it returns paComplete and sets FinishedPlaying. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer ): cint; cdecl;
var
  Chunk: TSherpaOnnxSamplesArray;
  Dst: pcfloat;
  K: Integer;
begin
  { Buffer is filled by GenerateCallback, so guard every access. }
  EnterCriticalSection(CriticalSection);
  try
    if Buffer.Size >= frameCount then
    begin
      { Enough data is queued: take exactly one block. }
      Chunk := Buffer.Get(Buffer.Head, frameCount);
      Buffer.Pop(frameCount);
    end
    else
    begin
      { Underrun: take whatever is available (possibly nothing) ... }
      if Buffer.Size > 0 then
      begin
        Chunk := Buffer.Get(Buffer.Head, Buffer.Size);
        Buffer.Pop(Buffer.Size);
      end;
      { ... and grow the array to the requested block size. }
      SetLength(Chunk, frameCount);
    end;

    Dst := pcfloat(output);
    for K := 0 to frameCount - 1 do
      Dst[K] := Chunk[K];

    if (Buffer.Size <= 0) and FinishedGeneration then
    begin
      { Nothing left to play and nothing more will arrive. }
      Result := paComplete;
      FinishedPlaying := True;
    end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
|
||
{ Creates the offline TTS object for the Piper model
  vits-piper-en_US-libritts_r-medium. All paths are relative to the current
  working directory. The caller owns the returned object and must free it. }
function GetOfflineTts: TSherpaOnnxOfflineTtsConfig_Result_Placeholder;
|
||
{ Main program of piper-playback.

  1. Creates the TTS object and, if the model's sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initializes PortAudio, selects the output device (default device, or the
     index given in the environment variable SHERPA_ONNX_MIC_DEVICE) and
     opens and starts a mono float32 output stream driven by PlayCallback.
  3. Generates speech; GenerateCallback feeds the samples into Buffer while
     PlayCallback plays them.
  4. Saves the complete audio to ./libritts_r-generated.wav and waits until
     playback has finished.

  Cleanup is done with nested try..finally blocks so that every exit path
  releases what was acquired. The stream is closed BEFORE the critical
  section is destroyed: PlayCallback sets FinishedPlaying while it still
  holds the lock, so destroying the lock first could race with the callback. }
begin
  Tts := GetOfflineTts;
  try
    { Resample only when the model's rate differs from the playback rate. }
    if Tts.GetSampleRate <> DeviceSampleRate then
      Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

    Version := String(Pa_GetVersionText);
    WriteLn('Version is ', Version);
    Status := Pa_Initialize;
    if Status <> paNoError then
    begin
      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;

    try
      NumDevices := Pa_GetDeviceCount;
      WriteLn('Num devices: ', NumDevices);

      DeviceIndex := Pa_GetDefaultOutputDevice;

      if DeviceIndex = paNoDevice then
      begin
        WriteLn('No default output device found');
        Exit;
      end;

      EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
      if EnvStr <> '' then
      begin
        DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
        WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
      end;

      { The index may come from the environment; reject values for which
        Pa_GetDeviceInfo would not return a valid record. }
      if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
      begin
        WriteLn('Invalid output device index: ', DeviceIndex);
        Exit;
      end;

      for I := 0 to (NumDevices - 1) do
      begin
        DeviceInfo := Pa_GetDeviceInfo(I);
        if I = DeviceIndex then
          WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
        else
          WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
      end;

      DeviceInfo := Pa_GetDeviceInfo(DeviceIndex);
      if DeviceInfo = nil then
      begin
        WriteLn('Invalid output device index: ', DeviceIndex);
        Exit;
      end;

      WriteLn('Use device ', DeviceIndex);
      WriteLn(' Name ', DeviceInfo^.Name);
      WriteLn(' Max output channels ', DeviceInfo^.MaxOutputChannels);

      Initialize(Param);
      Param.Device := DeviceIndex;
      Param.ChannelCount := 1;
      Param.SampleFormat := paFloat32;
      Param.SuggestedLatency := DeviceInfo^.DefaultHighOutputLatency;
      Param.HostApiSpecificStreamInfo := nil;

      Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

      { The lock must exist before the stream can invoke PlayCallback. }
      InitCriticalSection(CriticalSection);
      try
        { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
        Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
          PPaStreamCallback(@PlayCallback), nil);

        if Status <> paNoError then
        begin
          WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
          Exit;
        end;

        try
          Status := Pa_StartStream(stream);
          if Status <> paNoError then
          begin
            WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
            Exit;
          end;

          WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

          Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

          Audio := Tts.Generate(Text, SpeakerId, Speed,
            PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
          FinishedGeneration := True;
          SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
          WriteLn('Saved to ./libritts_r-generated.wav');

          while not FinishedPlaying do
            Pa_Sleep(100); {sleep for 0.1 second }
          {TODO(fangjun): Use an event to indicate the play is finished}
        finally
          { Close the stream first so the callback can no longer run. }
          Status := Pa_CloseStream(stream);
          if Status <> paNoError then
            WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
        end;
      finally
        { Safe now: no callback can still be holding the lock. }
        DoneCriticalSection(CriticalSection);
      end;
    finally
      Status := Pa_Terminate;
      if Status <> paNoError then
        WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
    end;
  finally
    { FreeAndNil is a no-op for objects that were never created (nil). }
    FreeAndNil(Buffer);
    FreeAndNil(Resampler);
    FreeAndNil(Tts);
  end;
end.
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
{ Copyright (c) 2024 Xiaomi Corporation } | ||
program piper; | ||
{ | ||
This file shows how to use the text to speech API of sherpa-onnx | ||
with Piper models. | ||
It generates speech from text and saves it to a wave file. | ||
If you want to play it while it is generating, please see | ||
./piper-playback.pas | ||
} | ||
|
||
{$mode objfpc} | ||
|
||
uses | ||
SysUtils, | ||
sherpa_onnx; | ||
|
||
{ Creates the offline TTS object for the Piper model
  vits-piper-en_US-libritts_r-medium. All paths are relative to the current
  working directory. The caller owns the returned object and must free it. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings. }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.Debug := False;
  TtsConfig.Model.NumThreads := 1;

  { Files that make up the VITS/Piper model. }
  TtsConfig.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  TtsConfig.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  TtsConfig.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
end;
|
||
var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;

  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;

{ Main program of piper: creates the TTS object, generates speech for a fixed
  sentence and saves it to ./libritts_r-generated.wav.
  The try..finally ensures the TTS object is released even if generation or
  writing the wave file raises an exception. }
begin
  Tts := GetOfflineTts;
  try
    WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

    Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

    Audio := Tts.Generate(Text, SpeakerId, Speed);
    SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
    WriteLn('Saved to ./libritts_r-generated.wav');
  finally
    FreeAndNil(Tts);
  end;
end.
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library (if it is not installed yet),
# downloads the vits-piper-en_US-libritts_r-medium model (if missing),
# compiles ./piper-playback.pas with fpc and runs the resulting program.
#
# Optional environment variable:
#   PORTAUDIO_LIB_DIR  directory containing the portaudio library
#                      (default: /usr/local/Cellar/portaudio/19.7.0/lib)

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model directory, the .pas source and the compiled binary are all
# addressed relative to this script, so make the script work from any cwd.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Please see ../portaudio-test/README.md
# for how to install portaudio on macOS
PORTAUDIO_LIB_DIR=${PORTAUDIO_LIB_DIR:-/usr/local/Cellar/portaudio/19.7.0/lib}

# Build and install the C API only when no shared library is present yet.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# Download the model once; tokens.txt is used as the "already present" marker.
if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  rm vits-piper-en_US-libritts_r-medium.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  -Fl"$PORTAUDIO_LIB_DIR" \
  ./piper-playback.pas

# Let the dynamic loader find the freshly built shared library at run time.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./piper-playback
Oops, something went wrong.