Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Comment thread
devin-ai-integration[bot] marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@
SARVAM_STT_TRANSLATE_STREAMING_URL = "wss://api.sarvam.ai/speech-to-text-translate/ws"

# Models
SarvamSTTModels = Literal["saarika:v2.5", "saarika:v2.0", "saaras:v2.5"]
SarvamSTTModels = Literal["saarika:v2.5", "saaras:v3"]
SarvamSTTModes = Literal["transcribe", "translate", "verbatim", "translit", "codemix"]


class ConnectionState(enum.Enum):
Expand All @@ -73,6 +74,7 @@ class SarvamSTTOptions:
Args:
language: BCP-47 language code, e.g., "hi-IN", "en-IN"
model: The Sarvam STT model to use
mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix)
base_url: API endpoint URL (auto-determined from model if not provided)
streaming_url: WebSocket streaming URL (auto-determined from model if not provided)
prompt: Optional prompt for STT translate (saaras models only)
Expand All @@ -81,13 +83,15 @@ class SarvamSTTOptions:
language: str # BCP-47 language code, e.g., "hi-IN", "en-IN"
api_key: str
model: SarvamSTTModels | str = "saarika:v2.5"
mode: SarvamSTTModes | str = "transcribe"
base_url: str | None = None
streaming_url: str | None = None
prompt: str | None = None # Optional prompt for STT translate (saaras models only)
high_vad_sensitivity: bool | None = None
sample_rate: int = 16000
flush_signal: bool | None = None
input_audio_codec: str | None = None
#mode: Literal["translate", "transcribe", "verbatim", "translit", "codemix"] = "transcribe"

def __post_init__(self) -> None:
"""Set URLs based on model if not explicitly provided."""
Expand All @@ -97,6 +101,20 @@ def __post_init__(self) -> None:
self.base_url = base_url
if self.streaming_url is None:
self.streaming_url = streaming_url
if self.model == "saaras:v3":
allowed_modes: set[str] = {
"transcribe",
"translate",
"verbatim",
"translit",
"codemix",
}
if self.mode not in allowed_modes:
raise ValueError(
"mode must be one of transcribe, translate, verbatim, translit, codemix"
)
else:
self.mode = "transcribe"
if self.sample_rate <= 0:
raise ValueError("sample_rate must be greater than zero")

Expand Down Expand Up @@ -151,6 +169,8 @@ def _build_websocket_url(base_url: str, opts: SarvamSTTOptions) -> str:
params["high_vad_sensitivity"] = str(opts.high_vad_sensitivity).lower()
if opts.flush_signal is not None:
params["flush_signal"] = str(opts.flush_signal).lower()
if opts.model == "saaras:v3":
params["mode"] = opts.mode
if opts.input_audio_codec:
params["input_audio_codec"] = opts.input_audio_codec

Expand All @@ -166,6 +186,7 @@ class STT(stt.STT):
Args:
language: BCP-47 language code, e.g., "hi-IN", "en-IN"
model: The Sarvam STT model to use
mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix)
api_key: Sarvam.ai API key (falls back to SARVAM_API_KEY env var)
base_url: API endpoint URL
http_session: Optional aiohttp session to use
Expand All @@ -177,6 +198,7 @@ def __init__(
*,
language: str = "en-IN",
model: SarvamSTTModels | str = "saarika:v2.5",
mode: SarvamSTTModes | str = "transcribe",
api_key: str | None = None,
base_url: str | None = None,
http_session: aiohttp.ClientSession | None = None,
Expand Down Expand Up @@ -206,6 +228,7 @@ def __init__(
language=language,
api_key=self._api_key,
model=model,
mode=mode,
base_url=base_url,
prompt=prompt,
high_vad_sensitivity=high_vad_sensitivity,
Expand Down Expand Up @@ -236,6 +259,7 @@ async def _recognize_impl(
*,
language: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN,
mode: NotGivenOr[SarvamSTTModes | str] = NOT_GIVEN,
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> stt.SpeechEvent:
"""Recognize speech using Sarvam.ai API.
Expand All @@ -254,8 +278,9 @@ async def _recognize_impl(
APIStatusError: On API errors (non-200 status)
APITimeoutError: On API timeout
"""
opts_language = self._opts.language if isinstance(language, type(NOT_GIVEN)) else language
opts_model = self._opts.model if isinstance(model, type(NOT_GIVEN)) else model
opts_language = self._opts.language if not is_given(language) else language
opts_model = self._opts.model if not is_given(model) else model
opts_mode = self._opts.mode if not is_given(mode) else mode

wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()

Expand All @@ -269,6 +294,8 @@ async def _recognize_impl(
form_data.add_field("language_code", opts_language)
if opts_model:
form_data.add_field("model", str(opts_model))
if opts_model == "saaras:v3":
form_data.add_field("mode", str(opts_mode))

if not self._api_key:
raise ValueError("API key cannot be None")
Expand Down Expand Up @@ -351,6 +378,7 @@ def stream(
*,
language: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN,
mode: NotGivenOr[SarvamSTTModes | str] = NOT_GIVEN,
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
prompt: NotGivenOr[str] = NOT_GIVEN,
high_vad_sensitivity: NotGivenOr[bool] = NOT_GIVEN,
Expand All @@ -361,11 +389,14 @@ def stream(
"""Create a streaming transcription session."""
opts_language = language if is_given(language) else self._opts.language
opts_model = model if is_given(model) else self._opts.model
opts_mode = mode if is_given(mode) else self._opts.mode

if not isinstance(opts_language, str):
opts_language = self._opts.language
if not isinstance(opts_model, str):
opts_model = self._opts.model
if not isinstance(opts_mode, str):
opts_mode = self._opts.mode

# Handle prompt conversion from NotGiven to None
final_prompt: str | None
Expand All @@ -390,6 +421,7 @@ def stream(
language=opts_language,
api_key=self._api_key if self._api_key else "",
model=opts_model,
mode=opts_mode,
prompt=final_prompt,
high_vad_sensitivity=opts_high_vad,
sample_rate=opts_sample_rate,
Expand Down Expand Up @@ -524,24 +556,46 @@ async def aclose(self) -> None:
# Clear reference to help with garbage collection
pass # Session reference will be cleared when object is destroyed

def update_options(self, *, language: str, model: str, prompt: str | None = None) -> None:
def update_options(
self,
*,
language: str,
model: str,
prompt: str | None = None,
mode: str | None = None,
) -> None:
"""Update streaming options."""
if not language or not language.strip():
raise ValueError("Language cannot be empty")
if not model or not model.strip():
raise ValueError("Model cannot be empty")
if mode is not None and model == "saaras:v3":
allowed_modes: set[str] = {
"transcribe",
"translate",
"verbatim",
"translit",
"codemix",
}
if mode not in allowed_modes:
raise ValueError(
"mode must be one of transcribe, translate, verbatim, translit, codemix"
)

self._opts.language = language
self._opts.model = model
if prompt is not None:
self._opts.prompt = prompt
if mode is not None:
self._opts.mode = mode
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
self._logger.info(
"Options updated, triggering reconnection",
extra={
"session_id": self._session_id,
"language": language,
"model": model,
"prompt": prompt,
"mode": mode,
},
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
self._reconnect_event.set()
Expand Down
Comment thread
devin-ai-integration[bot] marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
SARVAM_TTS_WS_URL = "wss://api.sarvam.ai/text-to-speech/ws"

# Sarvam TTS specific models and speakers
SarvamTTSModels = Literal["bulbul:v2"]
SarvamTTSModels = Literal["bulbul:v2", "bulbul:v3-beta"]

# Supported languages in BCP-47 format
SarvamTTSLanguages = Literal[
Expand All @@ -74,6 +74,34 @@
"abhilash",
"karun",
"hitesh",
# bulbul:v3-beta Customer Care
"shubh",
"ritu",
"rahul",
"pooja",
"simran",
"kavya",
"amit",
"ratan",
"rohan",
"dev",
"ishita",
"shreya",
"manan",
"sumit",
"priya",
# bulbul:v3-beta Content Creation
"aditya",
"kabir",
"neha",
"varun",
"roopa",
"aayan",
"ashutosh",
"advait",
# bulbul:v3-beta International
"amelia",
"sophia",
]

# Model-Speaker compatibility mapping
Expand All @@ -82,7 +110,65 @@
"female": ["anushka", "manisha", "vidya", "arya"],
"male": ["abhilash", "karun", "hitesh"],
"all": ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"],
}
},
"bulbul:v3-beta": {
"female": [
"ritu",
"pooja",
"simran",
"kavya",
"ishita",
"shreya",
"priya",
"neha",
"roopa",
"amelia",
"sophia",
],
"male": [
"shubh",
"rahul",
"amit",
"ratan",
"rohan",
"dev",
"manan",
"sumit",
"aditya",
"kabir",
"varun",
"aayan",
"ashutosh",
"advait",
],
"all": [
"shubh",
"ritu",
"rahul",
"pooja",
"simran",
"kavya",
"amit",
"ratan",
"rohan",
"dev",
"ishita",
"shreya",
"manan",
"sumit",
"priya",
"aditya",
"kabir",
"neha",
"varun",
"roopa",
"aayan",
"ashutosh",
"advait",
"amelia",
"sophia",
],
},
}


Expand Down Expand Up @@ -313,7 +399,7 @@ def update_options(
if model is not None:
if not model.strip():
raise ValueError("Model cannot be empty")
if model not in ["bulbul:v2"]:
if model not in ["bulbul:v2", "bulbul:v3-beta"]:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't a great pattern: when you introduce new models, the plugin cannot be used with them until it is updated.

We recommend not hard-blocking model lists in the plugin; your server should be the authority here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair, I'll remove this check

raise ValueError(f"Unsupported model: {model}")
self._opts.model = model
Comment on lines 461 to 464
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 TTS update_options doesn't validate the existing speaker's compatibility when the model is changed

When update_options is called with only model (without speaker), the existing speaker is not validated against the new model, potentially leaving the TTS in an invalid state.

Click to expand

How the bug is triggered

In update_options at tts.py:399-402, when model is updated, no validation is performed against the existing speaker:

if model is not None:
    if not model.strip():
        raise ValueError("Model cannot be empty")
    self._opts.model = model  # Speaker compatibility not checked!

The MODEL_SPEAKER_COMPATIBILITY mapping at tts.py:107-171 shows that bulbul:v2 and bulbul:v3-beta have completely different speaker sets with no overlap. For example, anushka is only valid for bulbul:v2, while shubh is only valid for bulbul:v3-beta.

Impact

If a TTS instance is created with model="bulbul:v2" and speaker="anushka", and update_options(model="bulbul:v3-beta") is then called, the speaker remains anushka, which is incompatible with bulbul:v3-beta. Subsequent TTS requests will then fail at the API level with an incompatible-speaker error.

Recommendation: After updating the model, validate that the current speaker is compatible with the new model. If not, either raise an error or reset to a default speaker for the new model.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


Comment on lines 461 to 478
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Revalidate the existing speaker when switching models.
update_options(model=...) can set bulbul:v3-beta while keeping an incompatible current speaker (e.g., "anushka"), and the mismatch isn’t checked unless speaker is also passed. This can surface as runtime API errors later.

🐛 Proposed fix
         if model is not None:
             if not model.strip():
                 raise ValueError("Model cannot be empty")
             if model not in ["bulbul:v2", "bulbul:v3-beta"]:
                 raise ValueError(f"Unsupported model: {model}")
             self._opts.model = model
+            if speaker is None and not validate_model_speaker_compatibility(
+                model, self._opts.speaker
+            ):
+                compatible = MODEL_SPEAKER_COMPATIBILITY.get(model, {}).get("all", [])
+                raise ValueError(
+                    f"Speaker '{self._opts.speaker}' is not compatible with model '{model}'. "
+                    "Please choose a compatible speaker from: "
+                    f"{', '.join(compatible)}"
+                )
🤖 Prompt for AI Agents
In `@livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/tts.py` around
lines 399 - 405, When update_options sets a new model (the block that assigns
self._opts.model), also revalidate the currently set speaker
(self._opts.speaker) if no new speaker is passed: check compatibility of the
existing speaker with the requested model and raise a ValueError if
incompatible. Implement this by adding a compatibility check (e.g., call a
helper like is_speaker_supported(model, self._opts.speaker) or inline logic)
immediately after setting model in update_options (the same scope where model,
speaker and self._opts.model are handled) so switching to "bulbul:v3-beta" with
an incompatible current speaker (e.g., "anushka") fails early rather than
causing runtime API errors.

Expand Down
Loading