diff --git a/pyproject.toml b/pyproject.toml index 44feee7..54c10eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "elevenlabs" -version = "v1.4.1" +version = "v1.5.0" description = "" readme = "README.md" authors = [] diff --git a/src/elevenlabs/client.py b/src/elevenlabs/client.py index 74a3e0e..ffe63a9 100644 --- a/src/elevenlabs/client.py +++ b/src/elevenlabs/client.py @@ -14,6 +14,7 @@ PronunciationDictionaryVersionLocator, Model from .environment import ElevenLabsEnvironment from .realtime_tts import RealtimeTextToSpeechClient +from .types import OutputFormat DEFAULT_VOICE = Voice( @@ -124,7 +125,7 @@ def generate( model: Union[ModelId, Model] = "eleven_monolingual_v1", optimize_streaming_latency: typing.Optional[int] = 0, stream: bool = False, - output_format: Optional[str] = "mp3_44100_128", + output_format: Optional[OutputFormat] = "mp3_44100_128", pronunciation_dictionary_locators: typing.Optional[ typing.Sequence[PronunciationDictionaryVersionLocator] ] = OMIT, @@ -152,7 +153,7 @@ def generate( Defaults to False. - - output_format: typing.Optional[str]. Output format of the generated audio. Must be one of: + - output_format: typing.Optional[OutputFormat]. Output format of the generated audio. Must be one of: mp3_22050_32 - output format, mp3 with 22.05kHz sample rate at 32kbps. mp3_44100_32 - output format, mp3 with 44.1kHz sample rate at 32kbps. mp3_44100_64 - output format, mp3 with 44.1kHz sample rate at 64kbps. @@ -303,7 +304,7 @@ async def generate( model: Union[ModelId, Model] = "eleven_monolingual_v1", optimize_streaming_latency: typing.Optional[int] = 0, stream: bool = False, - output_format: Optional[str] = "mp3_44100_128", + output_format: Optional[OutputFormat] = "mp3_44100_128", pronunciation_dictionary_locators: typing.Optional[ typing.Sequence[PronunciationDictionaryVersionLocator] ] = OMIT, @@ -338,7 +339,7 @@ async def generate( Defaults to False. - - output_format: typing.Optional[str]. Output format of the generated audio. Must be one of: + - output_format: typing.Optional[OutputFormat]. Output format of the generated audio. Must be one of: mp3_22050_32 - output format, mp3 with 22.05kHz sample rate at 32kbps. mp3_44100_32 - output format, mp3 with 44.1kHz sample rate at 32kbps. mp3_44100_64 - output format, mp3 with 44.1kHz sample rate at 64kbps. diff --git a/src/elevenlabs/realtime_tts.py b/src/elevenlabs/realtime_tts.py index a168a88..146431d 100644 --- a/src/elevenlabs/realtime_tts.py +++ b/src/elevenlabs/realtime_tts.py @@ -14,6 +14,7 @@ from .core.request_options import RequestOptions from .types.voice_settings import VoiceSettings from .text_to_speech.client import TextToSpeechClient +from .types import OutputFormat # this is used as the default value for optional parameters OMIT = typing.cast(typing.Any, ...) @@ -45,6 +46,7 @@ def convert_realtime( *, text: typing.Iterator[str], model_id: typing.Optional[str] = OMIT, + output_format: typing.Optional[OutputFormat] = "mp3_44100_128", voice_settings: typing.Optional[VoiceSettings] = OMIT, request_options: typing.Optional[RequestOptions] = None, ) -> typing.Iterator[bytes]: @@ -86,7 +88,8 @@ def get_text() -> typing.Iterator[str]: """ with connect( urllib.parse.urljoin( - "wss://api.elevenlabs.io/", f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?model_id={model_id}" + "wss://api.elevenlabs.io/", + f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?model_id={model_id}&output_format={output_format}" ), additional_headers=jsonable_encoder( remove_none_from_dict(