Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions livekit-agents/livekit/agents/inference/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,32 +34,39 @@


# OpenAI model identifiers, each prefixed with the "openai/" provider namespace.
# Every identifier is listed exactly once.
OpenAIModels = Literal[
    "openai/gpt-5",
    "openai/gpt-5-mini",
    "openai/gpt-5-nano",
    "openai/gpt-4o",
    "openai/gpt-4o-mini",
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1-nano",
    "openai/gpt-5.1",
    "openai/gpt-5.1-chat-latest",
    "openai/gpt-5.2",
    "openai/gpt-5.2-chat-latest",
    "openai/gpt-oss-120b",
]

GoogleModels = Literal[
"google/gemini-3-pro",
"google/gemini-3-flash",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we clarify they are preview versions?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think our docs page should be the authority here.

"google/gemini-2.5-pro",
"google/gemini-2.5-flash",
"google/gemini-2.5-flash-lite",
"google/gemini-2.0-flash",
"google/gemini-2.0-flash-lite",
]

QwenModels = Literal["qwen/qwen3-235b-a22b-instruct"]

KimiModels = Literal["moonshotai/kimi-k2-instruct"]

# DeepSeek model identifiers, each prefixed with the "deepseek-ai/" namespace.
DeepSeekModels = Literal[
    "deepseek-ai/deepseek-v3",
    "deepseek-ai/deepseek-v3.2",
]

# Union of the per-provider LLM model identifier literals.
# NOTE(review): QwenModels is declared in this module but is not a member of
# this union — confirm that the omission is intentional.
LLMModels = Union[OpenAIModels, GoogleModels, KimiModels, DeepSeekModels]


class ChatCompletionOptions(TypedDict, total=False):
Expand Down
43 changes: 39 additions & 4 deletions livekit-agents/livekit/agents/inference/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,21 @@
from ._utils import create_access_token

# Deepgram model identifiers, each prefixed with the "deepgram/" provider
# namespace (flux, nova-3 and nova-2 families).
DeepgramModels = Literal[
"deepgram/flux-general",
"deepgram/flux-general-en",
"deepgram/nova-3",
"deepgram/nova-3-general",
"deepgram/nova-3-medical",
"deepgram/nova-2",
"deepgram/nova-2-general",
"deepgram/nova-2-medical",
"deepgram/nova-2-conversationalai",
"deepgram/nova-2-phonecall",
]
# Per-provider model identifier literals for Cartesia, AssemblyAI and ElevenLabs.
# Each alias is assigned exactly once.
CartesiaModels = Literal["cartesia/ink-whisper"]
AssemblyAIModels = Literal[
    "assemblyai/universal-streaming",
    "assemblyai/universal-streaming-multilingual",
]
ElevenlabsModels = Literal["elevenlabs/scribe_v2_realtime"]


class CartesiaOptions(TypedDict, total=False):
Expand Down Expand Up @@ -67,6 +71,15 @@ class AssemblyaiOptions(TypedDict, total=False):
keyterms_prompt: list[str] # default: not specified


class ElevenlabsOptions(TypedDict, total=False):
    """Optional provider-specific keys for ElevenLabs models.

    Declared with ``total=False``, so every key may be omitted.
    """

    # NOTE(review): the meaning and units of each key are only implied by its
    # name here — confirm against the provider documentation before relying
    # on them.
    commit_strategy: Literal["manual", "vad"]
    include_timestamps: bool
    vad_silence_threshold_secs: float
    vad_threshold: float
    min_speech_duration_ms: int
    min_silence_duration_ms: int


STTLanguages = Literal["multi", "en", "de", "es", "fr", "ja", "pt", "zh", "hi"]


Expand Down Expand Up @@ -116,6 +129,7 @@ def _make_fallback(model: FallbackModelType) -> FallbackModel:
DeepgramModels,
CartesiaModels,
AssemblyAIModels,
ElevenlabsModels,
Literal["auto"], # automatically select a provider based on the language
]
STTEncoding = Literal["pcm_s16le"]
Expand Down Expand Up @@ -192,6 +206,23 @@ def __init__(
conn_options: NotGivenOr[APIConnectOptions] = NOT_GIVEN,
) -> None: ...

# Typing-only overload: when `model` is one of the ElevenlabsModels literals,
# `extra_kwargs` is narrowed to ElevenlabsOptions. The stub has no runtime body.
@overload
def __init__(
self,
model: ElevenlabsModels,
*,
language: NotGivenOr[str] = NOT_GIVEN,
base_url: NotGivenOr[str] = NOT_GIVEN,
encoding: NotGivenOr[STTEncoding] = NOT_GIVEN,
sample_rate: NotGivenOr[int] = NOT_GIVEN,
api_key: NotGivenOr[str] = NOT_GIVEN,
api_secret: NotGivenOr[str] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None,
extra_kwargs: NotGivenOr[ElevenlabsOptions] = NOT_GIVEN,
fallback: NotGivenOr[list[FallbackModelType] | FallbackModelType] = NOT_GIVEN,
conn_options: NotGivenOr[APIConnectOptions] = NOT_GIVEN,
) -> None: ...

@overload
def __init__(
self,
Expand Down Expand Up @@ -221,7 +252,11 @@ def __init__(
api_secret: NotGivenOr[str] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None,
extra_kwargs: NotGivenOr[
dict[str, Any] | CartesiaOptions | DeepgramOptions | AssemblyaiOptions
dict[str, Any]
| CartesiaOptions
| DeepgramOptions
| AssemblyaiOptions
| ElevenlabsOptions
] = NOT_GIVEN,
fallback: NotGivenOr[list[FallbackModelType] | FallbackModelType] = NOT_GIVEN,
conn_options: NotGivenOr[APIConnectOptions] = NOT_GIVEN,
Expand Down
51 changes: 44 additions & 7 deletions livekit-agents/livekit/agents/inference/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@

# Cartesia model identifiers: the bare provider name plus "cartesia/<model>"
# forms. Every identifier is listed exactly once.
CartesiaModels = Literal[
    "cartesia",
    "cartesia/sonic",
    "cartesia/sonic-3",
    "cartesia/sonic-2",
    "cartesia/sonic-turbo",
]
# Deepgram model identifiers: the bare provider name plus "deepgram/<model>" forms.
DeepgramModels = Literal[
"deepgram",
"deepgram/aura",
"deepgram/aura-2",
]
ElevenlabsModels = Literal[
"elevenlabs",
Expand All @@ -34,16 +40,18 @@
]
# Rime model identifiers: the bare provider name plus "rime/<model>" forms.
# Every identifier is listed exactly once.
RimeModels = Literal[
    "rime",
    "rime/mist",
    "rime/mistv2",
    "rime/arcana",
]
# Inworld model identifiers: the bare provider name plus "inworld/<model>" forms.
InworldModels = Literal[
"inworld",
"inworld/inworld-tts-1.5-max",
"inworld/inworld-tts-1.5-mini",
"inworld/inworld-tts-1-max",
"inworld/inworld-tts-1",
]

# Union of the per-provider TTS model identifier literals.
TTSModels = Union[CartesiaModels, DeepgramModels, ElevenlabsModels, RimeModels, InworldModels]


def _parse_model_string(model: str) -> tuple[str, str | None]:
Expand Down Expand Up @@ -98,8 +106,13 @@ def _make_fallback(model: FallbackModelType) -> FallbackModel:


class CartesiaOptions(TypedDict, total=False):
    """Optional provider-specific keys for Cartesia models.

    Declared with ``total=False``, so every key may be omitted.
    """

    duration: float  # max duration of audio in seconds
    # NOTE(review): a review comment on this change questions whether the
    # `speed` and `emotion` types match what the Cartesia plugin expects —
    # confirm before relying on these annotations.
    speed: Literal["slow", "normal", "fast"]  # default: not specified
    emotion: str
    volume: float

Comment on lines 108 to +112
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

# First, let's examine the file context to understand the CartesiaOptions usage
cat -n livekit-agents/livekit/agents/inference/tts.py | head -130 | tail -40

Repository: livekit/agents

Length of output: 172


🏁 Script executed:

# Also check if there are any other references to CartesiaOptions in the codebase
rg "CartesiaOptions" --type py -B 2 -A 5

Repository: livekit/agents

Length of output: 4238


🌐 Web query:

Cartesia TTS API documentation options parameters

💡 Result:

Below are the main Cartesia TTS API request options/parameters (from the official docs).

Common (all TTS endpoints)

  • Header: Cartesia-Version (required). Supported versions shown in the API reference include 2024-06-10, 2024-11-13, 2025-04-16. [1][2][3]
  • Auth:
    • /tts/bytes, /tts/sse: Authorization: Bearer <access_token> (per examples). [2][3]
    • WebSocket: API key can be provided via header (X-API-Key) or via query param api_key (useful in browsers). [4]

POST /tts/bytes (stream audio bytes)

Body fields:

  • model_id (required) [2]
  • transcript (required) [2]
  • voice (required): { "mode": "id", "id": "<voice_id>" } [2]
  • output_format (required) [2]
  • language (optional enum) [2]
  • generation_config (optional; only affects sonic-3): includes volume, speed, emotion [2]
  • save (optional bool; default false): when true, response includes Cartesia-File-ID header [2]
  • pronunciation_dict_id (optional; supported by sonic-3 and newer) [2]
  • speed (deprecated enum: slow|normal|fast; prefer generation_config.speed for sonic-3) [2]

POST /tts/sse (Server-Sent Events streaming)

Includes the same core fields as /tts/bytes (model_id, transcript, voice, output_format, language, generation_config, pronunciation_dict_id) plus streaming/timestamps controls: [3]

  • add_timestamps (bool; word timestamps) [3]
  • add_phoneme_timestamps (bool) [3]
  • use_normalized_timestamps (bool) [3]
  • context_id (optional string) [3]
  • speed (deprecated; same as above) [3]

GET wss://api.cartesia.ai/tts/websocket (bidirectional streaming)

Handshake query params (useful when headers aren’t available):

  • cartesia_version (alternative to Cartesia-Version) [4]
  • api_key (alternative to X-API-Key) [4]

Generation request message example fields include: model_id, transcript, voice, language, context_id, output_format, add_timestamps, continue (for continuation on the same context_id). [4]

output_format options (containers/encodings/sample rates)

The docs show these common options for output_format: [5]

  • container: raw | wav | mp3
  • sample_rate: 8000, 16000, 22050, 24000, 44100, 48000
  • encoding (required for raw and wav): pcm_f32le, pcm_s16le, pcm_mulaw, pcm_alaw
  • bit_rate (required for mp3)

Sources: [1] API conventions/versioning, [2] /tts/bytes, [3] /tts/sse, [4] /tts/websocket, [5] output format enums (shown on Cartesia API reference pages).


🏁 Script executed:

# Search for how CartesiaOptions is used/sent to the Cartesia API
rg "generation_config|CartesiaOptions" --type py -B 3 -A 3

Repository: livekit/agents

Length of output: 10488


🏁 Script executed:

# Look for Cartesia API client calls or serialization of CartesiaOptions
rg "cartesia|emotion|tts" livekit-agents/livekit/agents/inference/tts.py --type py -i -A 2

Repository: livekit/agents

Length of output: 6387


Speed and emotion types don't match Cartesia plugin implementation.

The CartesiaOptions TypedDict defines speed as Literal["slow", "normal", "fast"], but the Cartesia plugin expects floats for sonic-3 models (if not isinstance(self._opts.speed, float)). Additionally, emotion is typed as str but accessed as opts.emotion[0] in the plugin, suggesting indexing that doesn't align with a plain string type. Volume as float is correct. Update the TypedDict to match actual plugin expectations.

🤖 Prompt for AI Agents
In `@livekit-agents/livekit/agents/inference/tts.py` around lines 108 - 112, The
CartesiaOptions TypedDict currently mismatches the Cartesia plugin: change the
speed field from Literal["slow","normal","fast"] to a numeric type (float)
because the plugin checks `isinstance(self._opts.speed, float)` for sonic-3
models, and change emotion from `str` to a sequence type (e.g., `Sequence[str]`
or `List[str]`) because the plugin accesses `opts.emotion[0]`; keep volume as
`float`. Update the CartesiaOptions definition accordingly so it aligns with the
plugin's expectations.


class DeepgramOptions(TypedDict, total=False):
    """Provider-specific options for Deepgram models; declares no keys."""


class ElevenlabsOptions(TypedDict, total=False):
Expand Down Expand Up @@ -157,6 +170,25 @@ def __init__(
) -> None:
pass

# Typing-only overload: when `model` is one of the DeepgramModels literals,
# `extra_kwargs` is narrowed to DeepgramOptions. The stub body is a no-op.
@overload
def __init__(
    self,
    model: DeepgramModels,
    *,
    voice: NotGivenOr[str] = NOT_GIVEN,
    language: NotGivenOr[str] = NOT_GIVEN,
    encoding: NotGivenOr[TTSEncoding] = NOT_GIVEN,
    sample_rate: NotGivenOr[int] = NOT_GIVEN,
    base_url: NotGivenOr[str] = NOT_GIVEN,
    api_key: NotGivenOr[str] = NOT_GIVEN,
    api_secret: NotGivenOr[str] = NOT_GIVEN,
    http_session: aiohttp.ClientSession | None = None,
    extra_kwargs: NotGivenOr[DeepgramOptions] = NOT_GIVEN,
    fallback: NotGivenOr[list[FallbackModelType] | FallbackModelType] = NOT_GIVEN,
    conn_options: NotGivenOr[APIConnectOptions] = NOT_GIVEN,
) -> None:
    pass

@overload
def __init__(
self,
Expand Down Expand Up @@ -246,7 +278,12 @@ def __init__(
api_secret: NotGivenOr[str] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None,
extra_kwargs: NotGivenOr[
dict[str, Any] | CartesiaOptions | ElevenlabsOptions | RimeOptions | InworldOptions
dict[str, Any]
| CartesiaOptions
| DeepgramOptions
| ElevenlabsOptions
| RimeOptions
| InworldOptions
] = NOT_GIVEN,
fallback: NotGivenOr[list[FallbackModelType] | FallbackModelType] = NOT_GIVEN,
conn_options: NotGivenOr[APIConnectOptions] = NOT_GIVEN,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_inference_tts_fallback.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_provider_model_format_with_voice(self):
[
("elevenlabs/eleven_flash_v2:voice123", "elevenlabs/eleven_flash_v2", "voice123"),
("rime:speaker-a", "rime", "speaker-a"),
("rime/mist:narrator", "rime/mist", "narrator"),
("rime/mistv2:narrator", "rime/mistv2", "narrator"),
("inworld/inworld-tts-1:character", "inworld/inworld-tts-1", "character"),
("cartesia/sonic-turbo:deep-voice", "cartesia/sonic-turbo", "deep-voice"),
],
Expand Down
Loading